1 Main Body Figures
Here, I go through the creation of the different plots for the Figures
1.1 Figure 1
Figure 1. Assembly, annotation, and functional properties of the two metagenome-assembled genomes (MAGs) from the Swiss hard cheese starter culture RMK202. A) The Metagenome-assembled-genomes of S. thermophilus and L. delbrueckii with different genetic features highlighted (see legend). B) Functional properties potentially involved in the metabolic interaction of the two species. Filled red circles indicate presence, while empty circles indicate absence.
1.1.1 Circos plot
Here, I create the circos plots for the bacteria. I will include the following information: 1. genome (circular) 2. forward genes
3. reverse genes 2.2. tRNA and rRNA colored (forward) 3.2. tRNA and rRNA colored (reverse) 4. Pseudogenes 5. Prophage location 6. GC-skew (https://dbsloan.github.io/TS2019/exercises/circos.html#add-gc-skew-data-to-the-plot)
##==================
##Sterm
##==================
# Build the Circos track files for the S. thermophilus MAG (GFF sequence id
# CP046134, written out with the label "S_thermophilus_mag_rmk202") and render
# the plot. Every track file is tab-separated:
#   <label> <start> <end> [<circos options>]
home=/home/vincent/bin/apps/circos-0.69-6/
cd "$home"
##organisational
#cp /home/vincent/bin/apps/circos-0.69-6/etc/ticks_nwc_1_ldel_both.conf /home/vincent/bin/apps/circos-0.69-6/etc/ticks_mag_both.conf
#cp /home/vincent/bin/apps/circos-0.69-6/etc/ideogram.conf /home/vincent/bin/apps/circos-0.69-6/etc/ideogram_mag.conf
# Shared locations used by all commands of this section.
sterm_pgap=/home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/sterm
sterm_gff="${sterm_pgap}/S_thermophilus_RMK202.current.gb.gff"
sterm_fasta="${sterm_pgap}/S_thermophilus_RMK202 (2).fasta"   # file name contains a space: keep quoted
sterm_tracks="$home/data/rmk202/MAG_rmk202_sterm"
sterm_phaster=/home/vincent/Desktop/Projects/2019_RMK202_analysis/phage_annotation/phaster/ZZ_be76e5e924.PHASTER/prophage_summary_onlyGenome.gff
mkdir -p "${sterm_tracks}/"
##---------------------karyotyp
#seqlength.py "/home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/sterm/S_thermophilus_RMK202 (2).fasta" |head -1 > ${home}/data/karyotype/sterm_magg_rmk202.txt
##have to change the format a bit
##---------------------genes
# Convert the GenBank record to GFF3; the commands below read
# S_thermophilus_RMK202.current.gb.gff from this directory.
cd "${sterm_pgap}"
/usr/bin/perl /home/vincent/miniconda3/bin/bp_genbank2gff3.pl S_thermophilus_RMK202.current.gb
cd "$home"
# CDS features, one file per strand.
grep -v "^#" "${sterm_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" && $3=="CDS" && $7=="+" {print "S_thermophilus_mag_rmk202",$4,$5}' > "${sterm_tracks}/genes_forward.txt"
grep -v "^#" "${sterm_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" && $3=="CDS" && $7=="-" {print "S_thermophilus_mag_rmk202",$4,$5}' > "${sterm_tracks}/genes_reverse.txt"
##---------------------rRNA
grep -v "^#" "${sterm_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" && $3=="rRNA" && $7=="+" {print "S_thermophilus_mag_rmk202",$4,$5}' > "${sterm_tracks}/rRNA_forward.txt"
grep -v "^#" "${sterm_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" && $3=="rRNA" && $7=="-" {print "S_thermophilus_mag_rmk202",$4,$5}' > "${sterm_tracks}/rRNA_reverse.txt"
##---------------------tRNA
grep -v "^#" "${sterm_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" && $3=="tRNA" && $7=="+" {print "S_thermophilus_mag_rmk202",$4,$5}' > "${sterm_tracks}/tRNA_forward.txt"
grep -v "^#" "${sterm_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" && $3=="tRNA" && $7=="-" {print "S_thermophilus_mag_rmk202",$4,$5}' > "${sterm_tracks}/tRNA_reverse.txt"
##---------------------transposase from PGAP
# Any GFF line mentioning "transposase" (case-insensitive), regardless of feature type.
grep -i "transposase" "${sterm_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" {print "S_thermophilus_mag_rmk202",$4,$5}' > "${sterm_tracks}/transposase.txt"
##---------------------pseudogenes
grep -v "^#" "${sterm_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" && $3=="pseudogene" {print "S_thermophilus_mag_rmk202",$4,$5}' > "${sterm_tracks}/pseudogenes.txt"
#grep "^#" -v /home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/sterm/S_thermophilus_RMK202.current.gb.gff |awk -F "\t" '{OFS="\t"}{if($1=="CP046134" && $3=="tRNA" && $7=="-") print "S_thermophilus_mag_rmk202",$4,$5}' > $home/data/rmk202/MAG_rmk202_sterm/tRNA_reverse.txt
##---------------------prophages
awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" {print "S_thermophilus_mag_rmk202",$4,$5}' "${sterm_phaster}" > "${sterm_tracks}/prophages.txt"
##---------------------protease
#only in l. delbrueckii
#I checked online on NCBI if the genomes contain PrtS or PrtB
#/home/vincent/Downloads/prtS_Sterm.fasta
grep -v "^#" "${sterm_gff}" | grep "S8 family serine peptidase" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" {print "S_thermophilus_mag_rmk202",$4,$5}' > "${sterm_tracks}/protease.txt"
##---------------------transporter
grep -v "^#" "${sterm_gff}" | grep "transporter" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" {print "S_thermophilus_mag_rmk202",$4,$5}' > "${sterm_tracks}/transporter.txt"
##---------------------CRISPR arrays
#grep "SUMMARY BY POSITION" -A 50 $home/data/rmk202/MAG_rmk202_sterm/S_thermophilus_RMK202.pilarCR_out|grep "^====" -A 50 | sed '1d' | sed 's/^ *//g'| sed 's/ \{1,\}/\t/g'|awk -F "\t" '{OFS="\t"}{print "S_thermophilus_mag_rmk202",$3,$3+$4,"fill_color=blue"}' > $home/data/rmk202/MAG_rmk202_sterm/CRISPR.txt
##--cas genes
#grep "Cas" /home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/sterm/S_thermophilus_RMK202.current.gb.gff
# CRISPR-annotated CDS (color=chr3) first, then the repeat regions (color=chr2)
# appended to the same track file.
grep "CRISPR" "${sterm_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" && $3=="CDS" {print "S_thermophilus_mag_rmk202",$4,$5,"color=chr3"}' > "${sterm_tracks}/CRISPR.txt"
grep "CRISPR" "${sterm_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046134" && $3=="repeat_region" {print "S_thermophilus_mag_rmk202",$4,$5,"color=chr2"}' >> "${sterm_tracks}/CRISPR.txt"
##---------------------gc skew
GCcalc.py -f "${sterm_fasta}" > "${sterm_tracks}/gc_skew_sterm.txt"
# Column 5 of the GCcalc.py output is used as the plotted value: positive
# values are drawn blue, all others orange. The redirection target must be on
# the same line as '>' (a line break after '>' is a shell syntax error).
awk -F "\t" 'BEGIN{OFS="\t"} {if($5>0) print $1,$2,$2,$5,"fill_color=blue" ; else print $1,$2,$2,$5,"fill_color=orange"}' "${sterm_tracks}/gc_skew_sterm.txt" > "${sterm_tracks}/gc_skew_sterm_cleaned.txt"
##example
# NOTE(review): this overwrites etc/sterm_rmk202_mag_circos.conf with the
# template on every run - confirm this is wanted once the conf has been edited.
cp etc/repeat_nwc1_sterm_01.conf etc/sterm_rmk202_mag_circos.conf
#example run
bin/circos -conf etc/sterm_rmk202_mag_circos.conf -outputfile ./Sterm_mag_rmk202.svg; firefox ./Sterm_mag_rmk202.svg &
##==================
##Ldel
##==================
# Build the Circos track files for the L. delbrueckii MAG (GFF sequence id
# CP046131, written out with the label "L_delbrueckii_mag_rmk202").
# Every track file is tab-separated: <label> <start> <end> [<circos options>]
home=/home/vincent/bin/apps/circos-0.69-6/
cd "$home"
##organisational
#cp /home/vincent/bin/apps/circos-0.69-6/etc/ticks_nwc_1_ldel_both.conf /home/vincent/bin/apps/circos-0.69-6/etc/ticks_mag_both.conf ##already done
#cp /home/vincent/bin/apps/circos-0.69-6/etc/ideogram.conf /home/vincent/bin/apps/circos-0.69-6/etc/ideogram_mag.conf ##already done
#cp /home/vincent/bin/apps/circos-0.69-6/etc/sterm_rmk202_mag_circos.conf /home/vincent/bin/apps/circos-0.69-6/etc/ldel_rmk202_mag_circos.conf
# Shared locations used by all commands of this section.
ldel_pgap=/home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/ldel
ldel_gff="${ldel_pgap}/L_delbrueckii_RMK202.current.gb.gff"
ldel_fasta="${ldel_pgap}/L_delbrueckii_RMK202.fasta"
ldel_tracks="$home/data/rmk202/MAG_rmk202_ldel"
ldel_phaster=/home/vincent/Desktop/Projects/2019_RMK202_analysis/phage_annotation/phaster/ZZ_be76e5e924.PHASTER/prophage_summary_onlyGenome.gff
mkdir -p "${ldel_tracks}/"
##---------------------karyotyp
#seqlength.py /home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/ldel/L_delbrueckii_RMK202.fasta |head -1 > ${home}/data/karyotype/ldel_magg_rmk202.txt
##have to change the format a bit
##---------------------genes
# Convert the GenBank record to GFF3; the commands below read
# L_delbrueckii_RMK202.current.gb.gff from this directory.
cd "${ldel_pgap}"
/usr/bin/perl /home/vincent/miniconda3/bin/bp_genbank2gff3.pl L_delbrueckii_RMK202.current.gb
cd "$home"
# CDS features, one file per strand.
grep -v "^#" "${ldel_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046131" && $3=="CDS" && $7=="+" {print "L_delbrueckii_mag_rmk202",$4,$5}' > "${ldel_tracks}/genes_forward.txt"
grep -v "^#" "${ldel_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046131" && $3=="CDS" && $7=="-" {print "L_delbrueckii_mag_rmk202",$4,$5}' > "${ldel_tracks}/genes_reverse.txt"
##---------------------rRNA
grep -v "^#" "${ldel_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046131" && $3=="rRNA" && $7=="+" {print "L_delbrueckii_mag_rmk202",$4,$5}' > "${ldel_tracks}/rRNA_forward.txt"
grep -v "^#" "${ldel_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046131" && $3=="rRNA" && $7=="-" {print "L_delbrueckii_mag_rmk202",$4,$5}' > "${ldel_tracks}/rRNA_reverse.txt"
##---------------------transposase from PGAP
# Any GFF line mentioning "transposase" (case-insensitive), regardless of feature type.
grep -i "transposase" "${ldel_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046131" {print "L_delbrueckii_mag_rmk202",$4,$5}' > "${ldel_tracks}/transposase.txt"
##---------------------pseudogenes
grep -v "^#" "${ldel_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046131" && $3=="pseudogene" {print "L_delbrueckii_mag_rmk202",$4,$5}' > "${ldel_tracks}/pseudogenes.txt"
#grep "^#" -v /home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/ldel/L_delbrueckii_RMK202.current.gb.gff |awk -F "\t" '{OFS="\t"}{if($1=="CP046134" && $3=="tRNA" && $7=="-") print "L_delbrueckii_mag_rmk202",$4,$5}' > $home/data/rmk202/MAG_rmk202_ldel/tRNA_reverse.txt
##---------------------prophages
# Filter on the L. delbrueckii id CP046131, as every other track in this
# section does. The filter previously used CP046134 (the S. thermophilus id
# from the Sterm section), which put S. thermophilus prophage coordinates on
# the L. delbrueckii plot.
# NOTE(review): confirm that the PHASTER summary contains CP046131 records.
awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046131" {print "L_delbrueckii_mag_rmk202",$4,$5}' "${ldel_phaster}" > "${ldel_tracks}/prophages.txt"
##---------------------CRISPR arrays
#grep "Cas" /home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/ldel/L_delbrueckii_RMK202.current.gb.gff
# CRISPR-annotated CDS (color=chr3) first, then the repeat regions (color=chr2)
# appended to the same track file.
grep "CRISPR" "${ldel_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046131" && $3=="CDS" {print "L_delbrueckii_mag_rmk202",$4,$5,"color=chr3"}' > "${ldel_tracks}/CRISPR.txt"
grep "CRISPR" "${ldel_gff}" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046131" && $3=="repeat_region" {print "L_delbrueckii_mag_rmk202",$4,$5,"color=chr2"}' >> "${ldel_tracks}/CRISPR.txt"
##---------------------protease
#only in l. delbrueckii
#I checked online on NCBI if the genomes contain PrtS or PrtB
#/home/vincent/Downloads/prtS_Sterm.fasta
grep -v "^#" "${ldel_gff}" | grep "S8 family serine peptidase" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046131" {print "L_delbrueckii_mag_rmk202",$4,$5}' > "${ldel_tracks}/protease.txt"
##---------------------transporter
grep -v "^#" "${ldel_gff}" | grep "transporter" | awk -F "\t" 'BEGIN{OFS="\t"} $1=="CP046131" {print "L_delbrueckii_mag_rmk202",$4,$5}' > "${ldel_tracks}/transporter.txt"
##---------------------gc skew
GCcalc.py -f "${ldel_fasta}" > "${ldel_tracks}/gc_skew_ldel.txt"
# Column 5 of the GCcalc.py output is used as the plotted value: positive
# values are drawn blue, all others orange.
awk -F "\t" 'BEGIN{OFS="\t"} {if($5>0) print $1,$2,$2,$5,"fill_color=blue" ; else print $1,$2,$2,$5,"fill_color=orange"}' "${ldel_tracks}/gc_skew_ldel.txt" > "${ldel_tracks}/gc_skew_ldel_cleaned.txt"
##example
#cp etc/repeat_nwc1_sterm_01.conf etc/ldel_rmk202_mag_circos.conf
#example run
bin/circos -conf etc/ldel_rmk202_mag_circos.conf -outputfile ./ldel_mag_rmk202.svg; firefox ./ldel_mag_rmk202.svg &
1.1.2 POGENOM
This is the analysis done with POGENOM. We do this to get population-genomic insights such as dN/dS ratios. Important to note is that the GFF file should not contain any FASTA entries.
# Prepare the inputs for POGENOM: a CDS-only GFF for both species whose
# sequence ids are renamed to the accessions CP046131 / CP046134, followed by
# a sanity check of the sequence ids present in the GFF, assembly and VCF.
VCFFILE=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final/freebayesOuput_WithONT_Parallel_default/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL.recode.vcf
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/RMK202_MAG_assembly.fasta
mkdir -p /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PGAP/ForPOGENOM/
prokka_root=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA
pogenom_prep="${prokka_root}/ForPOGENOM"
# All files below are written to PROKKA/ForPOGENOM, so that directory has to
# exist as well (only PGAP/ForPOGENOM was created before).
mkdir -p "${pogenom_prep}"
#grep "^202-LMAG-1" /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Ldel/202-LMAG/PROKKA_04012020.gff |sed 's/^202-LMAG-1/CP046131/g' > /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/bothSpecies_prepped.gff
#grep "^202-SMAG-1" /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Sterm/202-SMAG/PROKKA_04012020.gff |sed 's/^202-SMAG-1/CP046134/g' >> /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/bothSpecies_prepped.gff
# Drop the '#' header lines and rename the PROKKA sequence ids; Ldel first
# (creates the file), Sterm appended.
grep -v "^#" "${prokka_root}/Ldel/202-LMAG/PROKKA_04012020.gff" | sed 's/^202-LMAG-1/CP046131/g' > "${pogenom_prep}/bothSpecies_prepped.gff"
grep -v "^#" "${prokka_root}/Sterm/202-SMAG/PROKKA_04012020.gff" | sed 's/^202-SMAG-1/CP046134/g' >> "${pogenom_prep}/bothSpecies_prepped.gff"
# Keep only rows whose third column is "CDS"; any other line is discarded.
awk -F "\t" '$3=="CDS"' "${pogenom_prep}/bothSpecies_prepped.gff" > "${pogenom_prep}/bothSpecies_prepped_ready.gff"
#GFF_file=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/MAG_annotation_onlyBAC_mod.gff
###--------------------------------
#RMK202
#Konserve_202
#Versand_202
#Lyo_202_2012
#lyo202_96
#Lyo_202_2014
#vcftools --vcf /archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final/freebayesOuput_WithONT_Parallel_default/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL.recode.vcf --minQ 30 --remove-indels --recode --recode-INFO-all \
# --keep /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/SamplesMeta.txt --out \
# /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_onlyMeta
# Replace carriage returns by newlines in the manually modified annotation.
sed -e 's/\r/\n/g' "${pogenom_prep}/MAG_annotation_onlyBAC_mod.gff" > "${pogenom_prep}/MAG_annotation_onlyBAC_mod_cleaned.gff"
#GFF_file=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/MAG_annotation_onlyBAC_mod_cleaned.gff
GFF_file=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/bothSpecies_prepped_ready.gff
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/RMK202_MAG_assembly.fasta
# NOTE(review): this VCF is the output of the vcftools call that is commented
# out above - it has to exist already for the check below to work.
VCFFILE=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_onlyMeta.recode.vcf
# Sanity check: list the sequence ids used in the GFF, the assembly and the VCF.
grep -v "^#" "${GFF_file}" | cut -f 1 | sort | uniq -c
#tail ${GFF_file}
grep ">" "${Assembly}"
grep -v "^#" "${VCFFILE}" | cut -f 1 | sort | uniq -c
##========================================
##Run POGENOME
##========================================
###-----------------
##Sterm
###-----------------
# Run POGENOM for S. thermophilus (CP046134): rename the PROKKA sequence id in
# the GFF, subset the variant calls to this chromosome and the listed samples,
# then call pogenom.pl with the bacterial genetic code table.
pogenom_annot=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation
# Both substitutions are applied to every line, in this order; the second one
# also removes the leading '>' of a matching sequence header.
sed -e 's/^202-SMAG-1/CP046134/g' -e 's/>202-SMAG-1/CP046134/g' "${pogenom_annot}/PROKKA/Sterm/S_I_202_SMAG/PROKKA_08282020.gff" > "${pogenom_annot}/PROKKA/ForPOGENOM/bothSpecies_prepped_sterm_cleaned.gff"
GFF_file_03=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/bothSpecies_prepped_sterm_cleaned.gff
#genomes=S_M_202_SMAG
#echo "##gff-version 3" > /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff
#echo "##sequence-region 202-SMAG-1 1 1865439" >> /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff
#grep "^CP046134" /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}.gff |awk -F "\t" '{OFS="\t"}{if($3=="gene")print $0}' |sed 's/ID=.*locus_tag=/ID=/g' |sed 's/gene/CDS/g'|sed 's/;.*$//g'| awk '{print $0"_gene"}' >> /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff
#GFF_file_03=/archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff
#echo "##FASTA" >> ${GFF_file_03}
#cat /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Sterm/S_I_202_SMAG/PROKKA_08282020.fna |sed 's/>202-SMAG-1/CP046134/g' >> ${GFF_file_03}
#samtools faidx /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/fasta/${genomes}.fasta CP046134 > /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/fasta/${genomes}_forPOGENOM.fasta
mkdir -p "${pogenom_annot}/PGAP/ForPOGENOM/"
# Variant subset: chromosome CP046134 only, minimum quality 30, indels
# removed, restricted to the samples listed in the --keep file.
vcftools --vcf /archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final/freebayesOuput_WithONT_Parallel_default/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL.recode.vcf --chr CP046134 --minQ 30 --remove-indels --recode --recode-INFO-all \
--keep /archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples_withEvolution.txt --out \
"${pogenom_annot}/PGAP/ForPOGENOM/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_onlyMeta_onlySterm"
VCFFILE=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PGAP/ForPOGENOM/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_onlyMeta_onlySterm.recode.vcf
# Start from an empty output directory; -f keeps the first run (directory not
# yet present) from failing.
rm -rf "${pogenom_annot}/POGENOM_PGAP/bothGEnomes_allSamples/sterm/"
mkdir -p "${pogenom_annot}/POGENOM_PGAP/bothGEnomes_allSamples/sterm/"
perl /home/vincent/apps/POGENOM/POGENOM-0.8.1/pogenom.pl --vcf_file "${VCFFILE}" --out "${pogenom_annot}/POGENOM_PGAP/bothGEnomes_allSamples/sterm/RunPogenome_all_rmk202_new" --gff_file "${GFF_file_03}" --genetic_code_file /home/vincent/apps/POGENOM/POGENOM-0.8.1/bacterial_genetic_code_table11_ncbi.txt
#--fasta_file /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/fasta/${genomes}_forPOGENOM.fasta
#--genome_size 1865459
###-----------------
##Ldel
###-----------------
# Run POGENOM for L. delbrueckii (CP046131): rename the PROKKA sequence id in
# the GFF, subset the variant calls to this chromosome and the listed samples,
# then call pogenom.pl with the bacterial genetic code table.
pogenom_annot=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation
# Both substitutions are applied to every line, in this order; the second one
# also removes the leading '>' of a matching sequence header.
sed -e 's/^202-LMAG-1/CP046131/g' -e 's/>202-LMAG-1/CP046131/g' "${pogenom_annot}/PROKKA/Ldel/L_I_202_LMAG/PROKKA_10022020.gff" > "${pogenom_annot}/PROKKA/ForPOGENOM/bothSpecies_prepped_ldel_cleaned.gff"
GFF_file_03=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/bothSpecies_prepped_ldel_cleaned.gff
#genomes=L_M_202_LMAG
#grep "^CP046131" /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}.gff |awk -F "\t" '{OFS="\t"}{if($3=="gene")print $0}' |sed 's/ID=.*locus_tag=/ID=/g' |sed 's/gene/CDS/g'|sed 's/;.*$//g'| awk '{print $0"_gene"}' > /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff
#GFF_file_03=/archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff
#echo "##FASTA" >> ${GFF_file_03}
#samtools faidx /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/fasta/${genomes}.fasta CP046131 |sed 's/>//g' >> ${GFF_file_03}
mkdir -p "${pogenom_annot}/PGAP/ForPOGENOM/"
# Variant subset: chromosome CP046131 only, minimum quality 30, indels
# removed, restricted to the samples listed in the --keep file.
vcftools --vcf /archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final/freebayesOuput_WithONT_Parallel_default/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL.recode.vcf --chr CP046131 --minQ 30 --remove-indels --recode --recode-INFO-all \
--keep /archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples_withEvolution.txt --out \
"${pogenom_annot}/PGAP/ForPOGENOM/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_onlyMeta_onlyLdel"
VCFFILE=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PGAP/ForPOGENOM/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_onlyMeta_onlyLdel.recode.vcf
# Start from an empty output directory; -f keeps the first run (directory not
# yet present) from failing.
rm -rf "${pogenom_annot}/POGENOM_PGAP/bothGEnomes_allSamples/Ldel/"
mkdir -p "${pogenom_annot}/POGENOM_PGAP/bothGEnomes_allSamples/Ldel/"
perl /home/vincent/apps/POGENOM/POGENOM-0.8.1/pogenom.pl --vcf_file "${VCFFILE}" --out "${pogenom_annot}/POGENOM_PGAP/bothGEnomes_allSamples/Ldel/RunPogenome_all_rmk202_new" --gff_file "${GFF_file_03}" --genetic_code_file /home/vincent/apps/POGENOM/POGENOM-0.8.1/bacterial_genetic_code_table11_ncbi.txt
#--genome_size 2166765
library(readr)
library(ggplot2)
library(lubridate)
library(tidyverse)
# ---- pN/pS of interaction-related genes: data preparation ----
# Builds `GENES_long`: one row per annotated gene of interest and species with
# its combined pN/pS value, number of loci and a functional class.

# Genes of interest with their orthogroup (OG); the genome count is dropped.
Genes_of_Interest <- read_delim(
  "../data_zenodo/non_genomic_data//all_interaction_functioning_Genes.txt",
  "\t",
  escape_double = FALSE,
  col_names = c("species", "gene", "OG", "numberGenomes"),
  trim_ws = TRUE
) %>%
  select(-numberGenomes)

# Orthogroup -> gene id tables, one per species.
OGs_Ldel <- read_delim(
  "../data_zenodo/non_genomic_data/OGs_Ldel.txt",
  "\t",
  escape_double = FALSE,
  col_names = c("OG", "Gene"),
  trim_ws = TRUE
)
OGs_Sterm <- read_delim(
  "../data_zenodo/non_genomic_data/OGs_Sterm.txt",
  "\t",
  escape_double = FALSE,
  col_names = c("OG", "Gene"),
  trim_ws = TRUE
)

Genes_of_Interest_Ldel <- Genes_of_Interest %>% filter(species == "Ldel")
Genes_of_Interest_Sterm <- Genes_of_Interest %>% filter(species == "Sterm")

# Full outer joins on the orthogroup id.
GENES_Ldel <- merge(OGs_Ldel, Genes_of_Interest_Ldel, by = "OG", all = TRUE)
# table(GENES_Ldel$gene)
GENES_Sterm <- merge(OGs_Sterm, Genes_of_Interest_Sterm, by = "OG", all = TRUE)
# table(GENES_Sterm$gene)

# Read a POGENOM per-gene pN/pS table and drop the per-sample pN/pS columns,
# so that only the remaining columns (e.g. the combined value) are kept.
read_pnps_per_gene <- function(path) {
  read_delim(path, "\t", escape_double = FALSE, trim_ws = TRUE) %>%
    select(-c(
      `RMK202 pNpS`, `Konserve_202 pNpS`, `Versand_202 pNpS`,
      `Lyo_202_2012 pNpS`, `lyo202_96 pNpS`, `Lyo_202_2014 pNpS`
    ))
}
RunPogenome_Ldel_pNpS_per_gene <- read_pnps_per_gene(
  "../data_zenodo/non_genomic_data/RunPogenome_Ldel.pNpS-per-gene.txt"
)
RunPogenome_Sterm_pNpS_per_gene <- read_pnps_per_gene(
  "../data_zenodo/non_genomic_data/RunPogenome_Sterm.pNpS-per-gene.txt"
)

# Quick look at the pN/pS distributions of all genes.
ggplot(RunPogenome_Sterm_pNpS_per_gene, aes(x = `All_samples_combined pNpS`)) +
  geom_density()
ggplot(RunPogenome_Ldel_pNpS_per_gene, aes(x = `All_samples_combined pNpS`)) +
  geom_density()

# Curated gene list: the first column is "<species>_<gene>", the second the
# gene id used by POGENOM. Split the first column at the first underscore.
all_interaction_functioning_Genes_cleaned <- read_delim(
  "../data_zenodo/non_genomic_data/all_interaction_functioning_Genes_cleaned.txt",
  "\t",
  escape_double = FALSE,
  col_names = c("geness", "Name"),
  trim_ws = TRUE
)
gene_id_parts <- str_split_fixed(
  all_interaction_functioning_Genes_cleaned$geness, fixed("_"), 2
)
all_interaction_functioning_Genes_cleaned$gene <- gene_id_parts[, 2]
all_interaction_functioning_Genes_cleaned$species <- gene_id_parts[, 1]
all_interaction_functioning_Genes_cleaned <-
  all_interaction_functioning_Genes_cleaned %>% select(-geness)

# Left joins: every gene of the POGENOM table is kept, curated genes get their
# name and species attached.
GENES_Sterm_extended <- merge(
  RunPogenome_Sterm_pNpS_per_gene, all_interaction_functioning_Genes_cleaned,
  by.x = "Gene", by.y = "Name", all.x = TRUE
)
GENES_Ldel_extended <- merge(
  RunPogenome_Ldel_pNpS_per_gene, all_interaction_functioning_Genes_cleaned,
  by.x = "Gene", by.y = "Name", all.x = TRUE
)
# Overview of the annotated L. delbrueckii genes. This call has to come after
# the merge: the object does not exist before it.
table(GENES_Ldel_extended$gene)
# GENES_Ldel_extended <- merge(GENES_Ldel,RunPogenome_Ldel_pNpS_per_gene,by="Gene",all = TRUE)
# table(GENES_Ldel$gene)
# GENES_Sterm_extended <- merge(GENES_Sterm,RunPogenome_Sterm_pNpS_per_gene,by="Gene",all = TRUE)
# table(GENES_Sterm$gene)

# Keep the genes of interest (gene name present) that have a locus count.
GENES_Sterm_reduced <- GENES_Sterm_extended %>%
  filter(!is.na(gene)) %>%
  filter(!is.na(Num_loci))
GENES_Ldel_reduced <- GENES_Ldel_extended %>%
  filter(!is.na(gene)) %>%
  filter(!is.na(Num_loci))

# A missing pN/pS value is set to 0.
GENES_Ldel_reduced$`All_samples_combined pNpS`[
  is.na(GENES_Ldel_reduced$`All_samples_combined pNpS`)
] <- 0
GENES_Sterm_reduced$`All_samples_combined pNpS`[
  is.na(GENES_Sterm_reduced$`All_samples_combined pNpS`)
] <- 0

GENES_Sterm_reduced_prep <- GENES_Sterm_reduced %>%
  select(c(gene, `All_samples_combined pNpS`, species, Num_loci))
GENES_ldel_reduced_prep <- GENES_Ldel_reduced %>%
  select(c(gene, `All_samples_combined pNpS`, species, Num_loci))

GENES_long <- rbind(GENES_Sterm_reduced_prep, GENES_ldel_reduced_prep) # %>% add_column(functioning="protocooperation\nrelated")
# Genes whose name contains "pep" are classed as peptidases.
GENES_long$functioning <- ifelse(
  grepl("pep", GENES_long$gene), "peptidase", "protocooperation\nrelated"
)
# Sort by functional class; the column is addressed by name instead of by
# position (it was column 5) so the sort survives column changes.
GENES_long <- GENES_long[order(GENES_long$functioning), ]
# Dot matrix of the genes of interest (rows) per species (columns); the dot
# colour encodes the combined pN/pS value on a red-to-grey scale with limits
# 0 to 2 and midpoint 1.
ggheatmap_CRISPR <- ggplot(
  GENES_long,
  aes(species, gene, color = `All_samples_combined pNpS`)
) +
  # filled dot coloured by pN/pS
  geom_point(size = 8, shape = "circle") +
  # open black circle drawn on top as an outline
  geom_point(shape = 1, size = 8, colour = "black") +
  scale_color_gradient2(
    low = "red", high = "grey",
    midpoint = 1, limit = c(0, 2), space = "Lab",
    name = "pN/pS", na.value = "white"
  ) +
  labs(legend = "CRISPR ID", x = "", y = "") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)
  ) +
  coord_fixed()

# Show the figure on the active device
print(ggheatmap_CRISPR)

## ===================================================
## write the figure to an SVG file
## ===================================================
svg("../03_results//HEATMAP_interactionGenes_pN_pS.svg", width = 3, height = 9)
print(ggheatmap_CRISPR)
dev.off()
## ===================================================
## distribution of pN/pS: genes of interest vs. all remaining genes
## ===================================================
# Complement of the genes of interest: rows without a curated gene name that
# still have a locus count.
GENES_Sterm_reverse <- GENES_Sterm_extended %>%
  filter(is.na(gene), !is.na(Num_loci))
GENES_Ldel_reverse <- GENES_Ldel_extended %>%
  filter(is.na(gene), !is.na(Num_loci))

# Missing pN/pS values are set to 0.
ldel_missing_pnps <- is.na(GENES_Ldel_reverse$`All_samples_combined pNpS`)
GENES_Ldel_reverse$`All_samples_combined pNpS`[ldel_missing_pnps] <- 0
sterm_missing_pnps <- is.na(GENES_Sterm_reverse$`All_samples_combined pNpS`)
GENES_Sterm_reverse$`All_samples_combined pNpS`[sterm_missing_pnps] <- 0

# Same four columns as in GENES_long, so the tables can be stacked.
GENES_Sterm_reverse_prep <- select(
  GENES_Sterm_reverse,
  c(gene, `All_samples_combined pNpS`, species, Num_loci)
)
GENES_ldel_reverse_prep <- select(
  GENES_Ldel_reverse,
  c(gene, `All_samples_combined pNpS`, species, Num_loci)
)
GENES_long_reverse <- add_column(
  rbind(GENES_Sterm_reverse_prep, GENES_ldel_reverse_prep),
  functioning = "other function"
)
GENES_complete <- rbind(GENES_long_reverse, GENES_long)

# Exploratory views, one panel per functional class.
ggplot(GENES_complete, aes(x = `All_samples_combined pNpS`)) +
  geom_density() +
  facet_wrap(~functioning)
ggplot(GENES_complete, aes(x = `All_samples_combined pNpS`, y = Num_loci)) +
  geom_point() +
  facet_wrap(~functioning)

# Fix the class order so that the colours below map as intended.
GENES_complete$functioning <- factor(
  GENES_complete$functioning,
  levels = c("protocooperation\nrelated", "peptidase", "other function")
)
colorssss <- c("red", "orange", "grey88")

# Final scatter plot: pN/pS against the number of loci, coloured by class.
dnDSplot <- ggplot(
  GENES_complete,
  aes(
    x = `All_samples_combined pNpS`, y = Num_loci,
    fill = functioning, color = functioning
  )
) +
  geom_point(size = 2) +
  theme_classic() +
  labs(x = "pN/pS", y = "Number of mutations") +
  scale_fill_manual(values = colorssss) +
  scale_color_manual(values = colorssss) +
  theme(legend.title = element_blank())
dnDSplot
svg("../03_results/dot_plot_pN_pS.svg", width = 4.5, height = 4)
dnDSplot
dev.off()
1.2 Figure 2
Figure 2. Metagenomic sampling design and species abundance. A) The starter culture propagation scheme as applied in the cheese starter culture production. The samples subjected to metagenomic sequencing are indicated by darker colors and labelled with numbers. Every propagation cycle includes a freeze drying, reactivation, and working stock step. From the working stock, commercial starter cultures for weekly shipments to cheesemakers are produced. The propagation experiment was carried out in the same way as in production and in five replicates corresponding to samples 7-11. The numbers between the working stock (x) indicate the number of cycles in between. B) Relative abundance of the two bacterial species in the eleven starter cultures samples (1-6, historical samples; 7-11, replicates of the propagation experiment). C) Bacterial counts throughout the propagation experiment for both species and the five replicates (lines are colored according to species and points according to samples within the propagation cycle Fig. 2A). D) Acidification potential throughout the propagation experiment, as measured by pH reached after 18h incubation at 37°C in sterile milk.
1.2.1 taxon plot
Do this by looking at the coverage of S.thermophilus and L.delbrueckii in the different metagenomic samples.
###================
##description file
###================
# Per-gene read coverage of every metagenomic sample: build one BED file with
# all gene intervals, run `bedtools coverage` per sample against it and
# collect the per-sample results (sample name appended as last column) in a
# single table.
threads=37
samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples.txt ##the file with all sample names
logFilelocation=/archiv/Projects/2019_RMK202_analysis/01_log
BaseLocation=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/RMK202_MAG_assembly.fasta
# This second assignment is the sample list that is actually used below.
samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples_withEvolution.txt ##the file with all sample names
abundance_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/07_abundances
genes_bed="${abundance_dir}/prep/genes_for_Abundances.bed"
mapping_dir="${abundance_dir}/mapping"
###================
##bring all CDS gffs together
###================
mkdir -p "${abundance_dir}/prep/"
# Entries of the phage annotation whose sequence id does not start with "CP".
# NOTE(review): columns 1-3 are taken here while the two GFFs below use
# columns 1,4,5 - this presumes the phage file already holds <id> <start> <end>
# in its first three columns; confirm.
grep -v "^CP" /archiv/Projects/2019_RMK202_analysis/phage_annotation/phaster/ZZ_be76e5e924.PHASTER/gene_CLEANED_FINAL_ALL.gff | awk -F "\t" 'BEGIN{OFS="\t"} {print $1,$2,$3}' > "${genes_bed}"
# CDS intervals of both bacterial genomes (sequence ids starting with "CP").
grep -v "^#" /archiv/Projects/2019_Pilotplan/11_PGAP/final/sterm/S_thermophilus_RMK202.current.gb.gff | grep "^CP" | awk -F "\t" 'BEGIN{OFS="\t"} $3=="CDS" {print $1,$4,$5}' >> "${genes_bed}"
grep -v "^#" /archiv/Projects/2019_Pilotplan/11_PGAP/final/ldel/L_delbrueckii_RMK202.current.gb.gff | grep "^CP" | awk -F "\t" 'BEGIN{OFS="\t"} $3=="CDS" {print $1,$4,$5}' >> "${genes_bed}"
###================
##mapping
###================
##============
##mapping to reference
##============
names=RMK202
# Start from an empty output directory so that the combined table (written
# with '>>' below) does not accumulate rows from earlier runs; -f keeps the
# first run (directory not yet present) from failing.
rm -rf "${mapping_dir}/"
mkdir -p "${mapping_dir}/"
# Progress counter; it has to be initialised, otherwise the first message is
# printed without a number and all later ones are off by one.
num=1
for names in $(cat ${samplesss} )
do
  echo ${num}"/16 :" ${names}
  num=$((num+1))
  bedtools coverage -bed -a "${genes_bed}" -b ${BaseLocation}/${names}/bwaMapping2DB/${names}_mapping2ref.bam > \
  "${mapping_dir}/${names}2bacteriaDB.bed"
  #${BaseLocation}/${names}/bwaMapping2DB/${names}_rawIllumina_bwamem_sorted.bam
  # Append the sample name as an extra column and add the rows to the combined table.
  awk -F "[\t]" -v namess="$names" 'BEGIN{OFS="\t"}{print $0,namess}' "${mapping_dir}/${names}2bacteriaDB.bed" >> "${mapping_dir}/all_2bacteria_and_phages_from_MAG.bed"
done
##===================================
#-------------file import
# Relative abundance per sample: median per-gene coverage of each replicon
# (column `chr`), expressed as a percentage of the summed medians of the sample.
read_count <- read_delim(
  "../data_zenodo/non_genomic_data/Coverage_bacteria_and_phages_from_MAG.bed",
  "\t",
  escape_double = FALSE,
  col_names = c("chr", "start", "end", "count", "length_mapped", "geneLength", "unknown", "sample"),
  trim_ws = TRUE
)
# read_count <- read_delim("/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/07_abundances/mapping/all_2bacteria_and_phages_from_MAG.bed","\t", escape_double = FALSE, col_names = c("chr","start","end","count","length_mapped","geneLength","unknown","sample"),trim_ws = TRUE)
table(read_count$chr)
# Read count scaled by 600 and divided by the gene length.
# NOTE(review): 600 is hard-coded; presumably the number of bases contributed
# per counted read (pair) - confirm.
read_count$geneCoverage <- (read_count$count * 600) / read_count$geneLength
library(dplyr)
# all_final <- read_count %>%
#   group_by(sample,chr) %>%
#   dplyr::summarize(median = median(geneCoverage))
#
# Median gene coverage per sample and replicon, restricted to the two
# chromosomes and three phages. summarize() leaves the result grouped by
# `sample`; the grouping is dropped explicitly because the `sample` column is
# recoded further down with base-R assignments.
all_final <- read_count %>%
  group_by(sample, chr) %>%
  dplyr::summarize(median = median(geneCoverage)) %>%
  dplyr::ungroup() %>%
  filter(chr %in% c("CP046131", "CP046134", "Lactobacillus_phage_1", "Streptococcus_phage_1", "Streptococcus_phage_2"))
# Sum of the medians per sample, used as the 100 % reference.
total_samples_sumTreatment <- aggregate(. ~ sample, data = all_final[, c("sample", "median")], sum, na.rm = TRUE)
all_final$total_coverage <- total_samples_sumTreatment[match(all_final$sample, total_samples_sumTreatment$sample), "median"]
all_final$percent_coverage <- 100 * (all_final$median / all_final$total_coverage)
table(all_final$sample)
# The two cheesemaking samples are excluded.
all_final <- all_final %>% filter(!sample %in% c("th_K2_8h", "di_K2_6h"))
# Plotting order of the samples.
all_final$sample <- factor(
  all_final$sample,
  levels = c("lyo202_96", "Lyo_202_2012", "Konserve_202", "Lyo_202_2014", "RMK202", "Versand_202", "G1_6_18", "G2_6_18", "G3_6_18", "G4_6_18", "G5_6_18")
)
# table(total_samples$phage) %>% length()
# The same table is written under two file names, without a header line.
write.table(all_final, "../03_results/coverage_rmk202.tsv", sep = "\t", quote = FALSE, col.names = FALSE)
write.table(all_final, "../03_results/coverage_rmk202_n32.tsv", sep = "\t", quote = FALSE, col.names = FALSE)
# all_colours <-c("#C5F6FA" ,"#6CF5A3" ,"#36E37B" ,"#10B552", "darkorange", "#FAA0A0","#EB4D4D")
all_colours <- c("#C5F6FA", "#6CF5A3", "#36E37B", "#10B552", "darkorange", "#FAA0A0", "#EB4D4D")
##----------------change name
# NOTE(review): attaching plyr after dplyr masks dplyr verbs such as
# summarise()/mutate() for all later code; re-attaching dplyr afterwards does
# not change the search order. Prefer explicit `dplyr::` prefixes further down.
library(plyr)
library(dplyr)
# Display labels for the historical samples.
all_final$sample <- revalue(
  all_final$sample,
  c(
    "lyo202_96" = "Lyo 1996", "Lyo_202_2012" = "Lyo\n2012", "Lyo_202_2014" = "Lyo\n2014",
    "Konserve_202" = "working\nstock", "RMK202" = "starter\nculture\n2012",
    "Versand_202" = "starter\nculture\n2018"
  )
)
##----------------plot
# all_final <- all_final %>% filter(!sample %in% c("cheesemaking\nday1","cheesemaking\nday2"))
levels(all_final$chr)
# all_final$species <- revalue(all_final$species, c("lyo202_96"="Lyo\n1996","Lyo_202_2012"="Lyo\n2012", "Lyo_202_2014"="Lyo\n2014", "Konserve_202"="working\nstock", "RMK202"="starter\nculture\n2012", "Versand_202"="starter\nculture\n2018","di_K2_6h"="cheesemaking\nday1","th_K2_8h"="cheesemaking\nday2"))
#all_final$chr <- factor(all_final$chr, levels=c("L_del_phage_01" , "L_del_plasmid_02" ,"L_plasmid_RMK202", "L_del_plasmid_01","S_term_phage_01","S_term_plasmid_01","L_delbrueckii_RMK202","S_thermophilus_RMK202"))
# Order of the replicons.
all_final$chr <- factor(
  all_final$chr,
  levels = c("Lactobacillus_phage_2", "Lactobacillus_phage_1", "CP046133", "CP046132", "Streptococcus_phage_2", "Streptococcus_phage_1", "CP046135", "CP046131", "CP046134")
)
# all_colours
# Earlier palettes, kept for reference (each was overwritten by the next one):
# all_colours_new <- c("#36E37A","#C5F6FA","#6CF5A3","#36E37B","orange","darkorange","#FAA0A0", "#10B552","#EB4D4D")
# all_colours_new <- c("#dbece1","#a0cbd2","#6bf5a2","#66c264","#ffa300","#ff8a00","#ff5200", "#10B552","#EB4D4D")
all_colours_new <- c("#a0cbd2", "#ffa300", "#ff8a00", "#10B552", "#EB4D4D")
# c("#C5F6FA","#6CF5A3","#36E37B", "#10B552","darkorange","#FAA0A0","#EB4D4D")
# c("L_del_phage_01" , "L_del_plasmid_02" ,"L_plasmid_RMK202", "L_del_plasmid_01","S_term_phage_01","S_term_plasmid_01","L_delbrueckii_RMK202","S_thermophilus_RMK202")
##===============================
## Bacteria only
##===============================
all_final_bacteria
# Percent share of each replicon in the per-sample sum of median coverages.
total_samples_sumTreatment <- aggregate(
  . ~ sample,
  data = all_final_bacteria[, c("sample", "median")],
  FUN = sum,
  na.rm = TRUE
)
bacteria_row <- match(all_final_bacteria$sample, total_samples_sumTreatment$sample)
all_final_bacteria$total_coverage <- total_samples_sumTreatment[bacteria_row, "median"]
all_final_bacteria$percent_coverage <- 100 * (all_final_bacteria$median / all_final_bacteria$total_coverage)

# Two fill colours for this plot.
all_colours_new <- c( "#10B552","#EB4D4D")

# Stacked bar chart of percent coverage per sample.
PrelAbundance_bacteria <- ggplot(
  data = all_final_bacteria,
  aes(y = percent_coverage, x = sample, group = interaction(chr), fill = chr)
) +
  geom_bar(stat = "identity") +
  labs("",
       x = "",
       y = "relative abundance") +
  theme_classic() +
  scale_fill_manual(values = all_colours_new) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
    legend.position = "right",
    legend.title = element_blank()
  )
library(patchwork)
PrelAbundance_bacteria
# Three panels side by side: all replicons and bacteria only (both without
# legend), then the all-replicon plot again with its legend. The composite is
# built before the device is opened so that a failure here cannot leave an
# svg device open.
relative_abundance_panels <- (PrelAbundance + theme(legend.position = "none")) +
  (PrelAbundance_bacteria + theme(legend.position = "none")) +
  PrelAbundance
svg("../03_results/relative_abundance_all.svg", width = 10, height = 4.5)
# Explicit print(): top-level auto-printing does not happen under source(),
# which would leave the svg empty.
print(relative_abundance_panels)
dev.off()

1.2.2 propagation experiment
In this project we carried out a propagation experiment. Here we analyse the phenotypic data.
# Read the propagation-experiment table: skip the first two lines, drop rows
# whose `sample` field is the literal text "sample", keep the columns used below.
rmk202_test_cellCounts <- read_csv(
  "../data_zenodo/non_genomic_data/rmk202_propagation_experiment_data.csv",
  skip = 2
) %>%
  filter(sample != "sample") %>%
  select(c(
    "sample", "step", "pH", "average BM", "average SR.9.3", "calculated Ldel",
    "generations_all", "generations_sterm", "generations_ldel",
    "cummulative_generation_all", "cummulative_generation_sterm", "cummulative_generation_ldel",
    "doublingTime_all", "doublingTime_sterm", "doublingTime_ldel",
    "survival_rate_all", "survival_rate_sterm", "survival_rate_ldel"
  ))

# The sample id is split on "_" into at most three pieces, stored as
# sampleNAME, week and passage.
sample_id_parts <- str_split_fixed(rmk202_test_cellCounts$sample, fixed("_"), 3)
rmk202_test_cellCounts$sampleNAME <- sample_id_parts[, 1]
rmk202_test_cellCounts$week <- sample_id_parts[, 2]
rmk202_test_cellCounts$passage <- sample_id_parts[, 3]
###------------------------cumulative generations per species (passage 18)
species_cum_columns <- c("cummulative_generation_sterm", "cummulative_generation_ldel")
cumGenData <- rmk202_test_cellCounts %>%
  filter(passage == 18) %>%
  select(sample, cummulative_generation_sterm, cummulative_generation_ldel) %>%
  gather(., feature, generations, species_cum_columns, factor_key = TRUE, na.rm = TRUE)
cumGenData$feature <- plyr::revalue(
  cumGenData$feature,
  c("cummulative_generation_sterm"="S.thermophilus","cummulative_generation_ldel"="L. delbrueckii")
)

# Fill colours, matched in order to the two `feature` levels.
colorsSpecies <- c("#EB4D4D", "#10B552")

# One box per species; x tick labels and legend are hidden.
cumGenPLOT <- ggplot(cumGenData, aes(x = feature, group = feature, y = generations, fill = feature)) +
  geom_boxplot() +
  theme_classic() +
  labs(y = "number of generations\n at the end", x = "") +
  theme(axis.text.x = element_blank(), legend.position = "none") +
  scale_fill_manual(values = colorsSpecies)
cumGenPLOT
###------------------------number of generations per species and propagation step
# This section used to appear twice; the second copy rebuilt identical data and
# only changed the y label to "Number of generations\nRelative abundance step",
# which does not describe this plot. One copy with the descriptive label is kept.
GenData <- rmk202_test_cellCounts %>%
  filter(passage != 1) %>%
  select(sample, generations_sterm, generations_ldel) %>%
  gather(., feature, generations, c("generations_sterm", "generations_ldel"), factor_key = TRUE, na.rm = TRUE)
GenData$feature <- plyr::revalue(GenData$feature, c("generations_sterm"="S.thermophilus","generations_ldel"="L. delbrueckii"))

# Overall mean across both species.
mean(GenData$generations)

GenPLOT <- ggplot(GenData, aes(x = feature, group = feature, y = generations, fill = feature)) +
  geom_boxplot() +
  theme_classic() +
  labs(y = "number of generations\n per propagation step", x = "") +
  theme(axis.text.x = element_blank(), legend.position = "none") +
  scale_fill_manual(values = colorsSpecies)
GenPLOT

# Per-species mean; the column was previously named `median` although it holds a mean.
GenData %>%
  group_by(feature) %>%
  dplyr::summarize(mean = mean(generations))
###------------------------average CFU after second passage
colnames(rmk202_test_cellCounts)
# Long table of the two count columns for the "second_passage" step. The value
# column is called `generations` although it holds the count columns; the name
# is kept because the plot below refers to it.
CountData <- rmk202_test_cellCounts %>%
  filter(step == "second_passage") %>%
  select("sample", "average SR.9.3", "calculated Ldel") %>%
  gather(., feature, generations, c("average SR.9.3", "calculated Ldel"), factor_key = TRUE, na.rm = TRUE)
CountData$feature <- plyr::revalue(CountData$feature, c("average SR.9.3"="S.thermophilus","calculated Ldel"="L. delbrueckii"))

CountPLOT <- ggplot(CountData, aes(x = feature, group = feature, y = generations, fill = feature)) +
  geom_boxplot() +
  theme_classic() +
  labs(y = "Bacterial count of\nworking stock [CFU/ml]", x = "") +
  theme(axis.text.x = element_blank(), legend.position = "none") +
  scale_fill_manual(values = colorsSpecies)
CountPLOT

# Per-species mean and standard deviation. The mean used to be reported in a
# column named `median`, and the mean-only summary was computed a second time.
CountData %>%
  group_by(feature) %>%
  dplyr::summarize(mean = mean(generations), sd = sd(generations))
###------------------------survival rate after freeze drying
rmk202_test_cellCounts$step
# Overall survival rate for the "freeze_dry" step; the constant `species`
# column only provides a single x position for the boxplot.
SurvivalData <- rmk202_test_cellCounts %>%
  filter(step == "freeze_dry") %>%
  select("sample", "survival_rate_all") %>%
  add_column(species = "all")

# Axis label typo fixed ("Survivial" -> "Survival").
SurvivalPLOT <- ggplot(SurvivalData, aes(x = species, group = species, y = survival_rate_all, fill = species)) +
  geom_boxplot() +
  theme_classic() +
  labs(y = "Survival rate\n [%]", x = "") +
  theme(axis.text.x = element_blank(), legend.position = "none") +
  scale_fill_manual(values = "grey88")
SurvivalPLOT

# Mean survival rate; the column was previously named `median` although it holds a mean.
SurvivalData %>%
  group_by(species) %>%
  dplyr::summarize(mean = mean(survival_rate_all))
###------------------------CFU over time
rmk202_test_cellCounts$cummulative_generation_all
# Long table of both count columns, without passage "17" and without the
# freeze-drying step.
CFUData <- rmk202_test_cellCounts %>%
  filter(passage != "17") %>%
  filter(step != "freeze_dry") %>%
  select("sample", "step", "cummulative_generation_all", "sampleNAME", "passage", "average SR.9.3", "calculated Ldel") %>%
  add_column(species = "all") %>%
  gather(., feature, generations, c("average SR.9.3", "calculated Ldel"), factor_key = TRUE, na.rm = TRUE)

# Rows without a cumulative generation count are placed at generation 0.
missing_generation <- which(is.na(CFUData$cummulative_generation_all))
CFUData[missing_generation, "cummulative_generation_all"] <- 0

CFUData$feature <- plyr::revalue(CFUData$feature, c("average SR.9.3"="S.thermophilus","calculated Ldel"="L. delbrueckii"))
CFUData$passage <- as.double(CFUData$passage)
CFUData$sampleNAME <- as.factor(CFUData$sampleNAME)

# One facet per species; points are coloured by step, lines (one per
# sampleNAME) by species.
CFUPLOT <- ggplot(CFUData, aes(x = cummulative_generation_all, group = sampleNAME, y = generations)) +
  facet_wrap(~feature, scales = "free_y", ncol = 1) +
  geom_point(aes(color = step), size = 2) +
  geom_line(aes(color = feature), alpha = 0.4, size = 1) +
  theme_classic() +
  labs(y = "Bacterial count\n[CFU/ml]", x = "generations") +
  theme(
    axis.text.x = element_text(size = 9),
    legend.position = "top",
    legend.title = element_blank()
  ) +
  scale_fill_manual(values = colorsSpecies) +
  scale_color_manual(values = c("start"="black","first_passage"="#d9d9d9","second_passage"="#b3b3ff","freeze_dry"="#ff0101","S.thermophilus"="#EB4D4D","L. delbrueckii"="#10B552"))
CFUPLOT
###------------------------pH over time
# pH per sample and step, without the freeze-drying step.
pHData <- rmk202_test_cellCounts %>%
  filter(step != "freeze_dry") %>%
  select("sample", "step", "sampleNAME", "cummulative_generation_all", "pH")
# Rows without a cumulative generation count are placed at generation 0.
pHData[which(is.na(pHData$cummulative_generation_all)), "cummulative_generation_all"] <- 0
pHData$sampleNAME <- as.factor(pHData$sampleNAME)

# Points coloured by step, one grey line per sampleNAME.
pHPLOT <- ggplot(pHData, aes(x = cummulative_generation_all, group = sampleNAME, y = pH)) +
  geom_point(aes(color = step), size = 2) +
  geom_line(color = "grey", alpha = 0.4) +
  theme_classic() +
  labs(y = "pH", x = "generations") +
  theme(axis.text.x = element_text(size = 9), legend.position = "none") +
  scale_color_manual(values = c("start"="black","first_passage"="#d9d9d9","second_passage"="#b3b3ff","freeze_dry"="#ff0101","S.thermophilus"="#EB4D4D","L. delbrueckii"="#10B552"))
pHPLOT

# Overall mean and standard deviation of pH; the mean was previously reported
# in a column named `median`.
pHData %>%
  dplyr::summarize(mean = mean(pH), sd = sd(pH))
###------------------------put figures together
# Middle row: the four boxplots, the survival panel at half width.
plot1 <- SurvivalPLOT + CountPLOT + GenPLOT + cumGenPLOT +
  plot_layout(ncol = 4, widths = c(0.5, 1, 1, 1))
# Three rows: CFU over generations, the boxplot row, pH over generations.
plotFinal <- CFUPLOT + plot1 + pHPLOT +
  plot_layout(nrow = 3, heights = c(1, 1, 0.5))
plotFinal
svg("../03_results/evolutionExperiment_plot.svg", width = 6, height = 8)
# Explicit print(): top-level auto-printing does not happen under source(),
# which would leave the svg empty.
print(plotFinal)
dev.off()

1.3 Figure 3
Figure 3. Strain-level diversity of S. thermophilus in cheese starter cultures. A) Alternative allele frequencies of all S. thermophilus SNVs over the metagenomic samples. Recurring SNVs from different samples are connected with a line. Clustering of lines indicates a large number of SNVs with similar frequencies, suggesting genomic coupling. Sample labels on the x-axis correspond to samples highlighted in Fig. 2A. B) Phylogeny of the isolated S. thermophilus strains based on maximum likelihood analysis on 1788 core genes. The isolates split into four lineages indicated by different color shadings. Strains sequenced with Nanopore are labelled with an asterisk. Values on branches indicate bootstrap values (100 replicates). C) Relative abundance of each of the four sub-lineages of S. thermophilus across the eleven metagenomes, based on the average frequency of lineage-specific SNVs identified on the basis of the isolates in Fig. 3B.
1.3.1 SNV calling and phasing
This is a very extensive analysis. It consists of:
- mapping raw reads with bwa mem
- calling and filtering SNVs with freebayes and vcftools
- plotting the alternative allele frequencies with R
###================
## run description: resources, sample list, log/output locations, reference
###================
threads=37
samplesss="/archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt"   ## the file with all sample names
logFilelocation="/archiv/Projects/2019_RMK202_analysis/01_log"
BaseLocation="/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final"
Assembly="/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/RMK202_MAG_assembly_phagesOrientated.fasta"
## !!! adapt the sample name and the forward/reverse read paths below
names="G4_6_18"
readDir="/home/vincent/Projects/2020_StarterEvolution/01_data/20200929_Novogene/X204SC20090774-Z01-F001/trimm_Galore/gz/${names}"
r1="${readDir}/${names}-R1_val_1.fq"
r2="${readDir}/${names}-R2_val_2.fq"
###================
##merge genome
#there was a problem in the G4 sample so I had to change the fastq input
###================
#cp /archiv/Projects/2019_RMK202_analysis/01_log/locationRawData_allSamples.txt /archiv/Projects/2019_RMK202_analysis/01_log/locationRawData_allSamples_G4_corrected.txt
#/archiv/Projects/2019_pilotplant/01_rawData/02_mergedfastq/kneaddata/G4_6_18/G4_6_18/G4_6_18-*_kneaddata_paired_1.fastq
#/archiv/Projects/2019_pilotplant/01_rawData/02_mergedfastq/kneaddata/G4_6_18/G4_6_18/G4_6_18-*_kneaddata_paired_2.fastq
###================
##script
###================
##============
## mapping to reference
##============
# Tab-separated table: sample name, forward reads, reverse reads.
sampleTable=/archiv/Projects/2019_RMK202_analysis/01_log/locationRawData_allSamples.txt
bwa index "${Assembly}"
mkdir -p "${logFilelocation}"
mkdir -p "${BaseLocation}/log"
# Number of samples for the progress message (previously hard-coded as 16).
total=$(cut -f 1 "${sampleTable}" | grep -c .)
num=1
for names in $(cut -f 1 "${sampleTable}")
do
  echo "${num}/${total} : ${names}"
  num=$((num+1))
  # ${var:?} aborts instead of expanding to "/bwaMapping2DB" when a variable
  # is empty, so the recursive delete cannot hit an unintended path.
  outDir="${BaseLocation:?}/${names:?}/bwaMapping2DB"
  bam="${outDir}/${names}_rawIllumina_bwamem_sorted.bam"
  rm -rf -- "${outDir}"
  mkdir -p "${outDir}"
  # Exact match on the first column; the former grep "^${names}" also matched
  # every sample whose name merely starts with ${names}.
  r1=$(awk -F "\t" -v s="${names}" '$1==s {print $2}' "${sampleTable}")
  r2=$(awk -F "\t" -v s="${names}" '$1==s {print $3}' "${sampleTable}")
  # r1/r2 stay unquoted on purpose: NOTE(review) the table may contain
  # wildcard paths (see the G4 note above) that rely on shell expansion - confirm.
  bwa mem -t ${threads} "${Assembly}" \
    ${r1} ${r2} | samtools sort -@${threads} -O BAM -o "${bam}" -
  # Unmapped reads (flag 4) are exported as fastq; the intermediate BAM is removed.
  samtools view -b -f 4 "${bam}" > "${outDir}/${names}_rawIllumina_bwamem_sorted_unmapped.bam"
  bedtools bamtofastq -i "${outDir}/${names}_rawIllumina_bwamem_sorted_unmapped.bam" -fq "${outDir}/${names}_rawIllumina_bwamem_sorted_unmapped.fq"
  rm -f "${outDir}/${names}_rawIllumina_bwamem_sorted_unmapped.bam"
  ##-----Qualimap
  mkdir -p "${outDir}/bamqc"
  qualimap bamqc -bam "${bam}" -outdir "${outDir}/bamqc" --java-mem-size=80G
  # Record the bamqc directory in the list file.
  echo "${BaseLocation}/${names}/bwaMapping2DB/bamqc" >> "${BaseLocation}/log/multiqc_logfile_finalGenome.txt"
done
##-------------------------------------------------
##-----mapping depth
##-------------------------------------------------
# Per-position depth (samtools depth -a) of every contig whose name starts
# with "Streptococcus_ph", with the sample name appended as last column.
depthTable="${BaseLocation}/complete_depth.txt"
# -f: no error when the table does not exist yet (first run).
rm -f "${depthTable}"
for names in $(cut -f 1 /archiv/Projects/2019_RMK202_analysis/01_log/locationRawData_allSamples.txt)
do
  echo "${names}"
  samtools depth -a "${BaseLocation}/${names}/bwaMapping2DB/${names}_rawIllumina_bwamem_sorted.bam" \
    | grep "^Streptococcus_ph" \
    | awk -F "\t" -v sampsss="$names" '{OFS="\t"}{print $0,sampsss}' >> "${depthTable}"
done
##-------------------------------------------------
##-----check read coverage
##-------------------------------------------------
name_folder=02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final
threads=37
names=Konserve_202
num=1
# mappings.txt is a file: plain rm -f, which also does not complain on the first run.
mkdir -p "${BaseLocation}/log"
rm -f "${BaseLocation}/log/mappings.txt"
#for names in $(cat /archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt)
for names in $(cat /archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples.txt)
do
  # Text between "(" and "%)" on the "number of mapped reads" line of the
  # Qualimap report.
  mapped=$(grep " number of mapped reads = " "${BaseLocation}/${names}/bwaMapping2DB/bamqc/genome_results.txt" | cut -d '(' -f 2 | sed 's/%)//g')
  echo -e "${names}\tonlyMeta\t${mapped}" >> "${BaseLocation}/log/mappings.txt"
done

This is a good [blog](https://bioinformatics-core-shared-training.github.io/cruk-summer-school-2017/Day2/vcf-intro.nb.html) on how to use freebayes and how to parse the output. Finally I will do freebayes on all samples simultaneously.
Interesting output from column 11 on:
- GT: genotype (example: 1)
- DP: read depth (example: 1040)
- DPR: number of observations for each allele (example: 1,1037)
- RO: reference allele observation count (example: 1)
- QR: sum of quality of the reference observations (example: 10)
- AO: alternate allele observation count (example: 1037)
- QA: sum of quality of the alternate observations (example: 36917)
- GL: genotype likelihood, i.e. log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy
Very nice blog illustrating good filtering practices.
Here we do snpEff.
###================
##description file
###================
# Settings for the snpEff annotation run.
# NOTE(review): threads, samplesss, logFilelocation, BaseLocation and Assembly
# are assigned twice in this section; the second group wins, so the first
# BaseLocation/Assembly values are never in effect once the section has run
# top to bottom. Only FREEBAYES_out and sampleShort come from the first group.
threads=37
samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt ##the file with all sample names
logFilelocation=/archiv/Projects/2019_RMK202_analysis/01_log
BaseLocation=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/di_rmk202_MAG_reference_polished.fasta
# Name of the sub-directory of ${BaseLocation} used for the freebayes output.
FREEBAYES_out=freebayesOuput_WithONT_Parallel_default
# snpEff genome/database name (reassigned again further down in the file).
sampleShort=PROKKA_meta_all ##For st_thermophilus
# Second group of settings: these values override the ones above.
threads=37
samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt ##the file with all sample names
logFilelocation=/archiv/Projects/2019_RMK202_analysis/01_log
BaseLocation=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/RMK202_MAG_assembly.fasta
##!!!!!!!!!!!!!change names of Forward and Reverse reads
##=======================================================================================================================
##-----------------------------------------------all together----------------------------------------------
##=======================================================================================================================
##==================
##Building Database
##==================
#/archiv/Projects/2018_Culturomics/genomes/Lactobacillus_delbrueckii_RMK202.fna
#/archiv/Projects/2018_Culturomics/genomes/Streptococcus_thermophilus_RMK202.fna
##-------------
##01_add genome name to the snpEff config
##-------------
# Interactive step: open the config and add the genome entry by hand. The
# entries below are config lines, not shell commands, so all of them are
# commented out. The PROKKA_meta_all line used to be uncommented and would
# have been executed as a command.
# NOTE(review): sampleShort is set to PGAP_meta_all below; make sure the entry
# added to the config matches the name that is actually built.
vim /home/vincent/miniconda2/share/snpeff-4.3.1t-2/snpEff.config
#S_thermophilus_mag_rmk202.genome : S_thermophilus_mag_rmk202
#L_delbrueckii_mag_rmk202.genome : L_delbrueckii_mag_rmk202
#Streptococcus_phage_rmk202.genome : Streptococcus_phage_rmk202
#PGAP_meta_all.genome : PGAP_meta_all
#PROKKA_meta_all.genome : PROKKA_meta_all
##-------------
#02_choose the database name
##-------------
#sampleShort=S_thermophilus_mag_rmk202
#sampleShort=CP046134 ##For st_thermophilus
#sampleShort=PROKKA_meta_all/
sampleShort=PGAP_meta_all
#/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/circlatorAfter//circlaring_ldel/final/L_delbrueckii_mag_rmk202.fasta
##-------------
#03_put genome fasta
##-------------
# ${sampleShort:?} aborts when the name is empty, so the recursive delete can
# never hit the whole snpEff data directory; -f keeps the first run quiet.
rm -rf -- "/home/vincent/miniconda2/share/snpeff-4.3.1t-2/data/${sampleShort:?}"
mkdir -p /home/vincent/miniconda2/share/snpeff-4.3.1t-2/data/${sampleShort}
cat ${Assembly} > /home/vincent/miniconda2/share/snpeff-4.3.1t-2/data/${sampleShort}/sequences.fa
##-------------
#04 genes gff
##-------------
#grep "^#" -v /archiv/Projects/2019_Nano_Meta/02_flye_Assembly/PGAP_assembly/sterm/S_thermophilus_RMK202.current.gff|grep -e "${sampleShort}" |cut -f 1|sort|uniq -c
#cat /archiv/Projects/2019_Nano_Meta/02_flye_Assembly/PGAP_assembly/sterm/S_thermophilus_RMK202.current.gff | grep -e "^${sampleShort}" |awk -F "\t" '{OFS"\t"}{if($3=="CDS") print $0}' > /home/vincent/miniconda2/share/snpeff-4.3.1t-2/data/${sampleShort}//genes.gff
##-------------
#04 all genes gff
##-------------
##already preped in RAST chunk
#cat /archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/spades_nonMapping_assembly/annotation_eggnog_rast/assembly_20191129/merged/all_genes_rmk202.gff |sed 's/gene/CDS/g' > /home/vincent/miniconda2/share/snpeff-4.3.1t-2/data/${sampleShort}//genes.gff
# CDS rows of the two PROKKA annotations, with the PROKKA contig ids renamed
# to CP046134 and CP046131. The first command creates genes.gff, the second
# appends to it.
awk -F "\t" '{OFS="\t"}{if( $3=="CDS")print $0}' /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Sterm/202-SMAG/PROKKA_04012020.gff|sed 's/^202-SMAG-1/CP046134/g' > /home/vincent/miniconda2/share/snpeff-4.3.1t-2/data/${sampleShort}//genes.gff
awk -F "\t" '{OFS="\t"}{if( $3=="CDS")print $0}' /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Ldel//202-LMAG/PROKKA_04012020.gff|sed 's/^202-LMAG-1/CP046131/g' >> /home/vincent/miniconda2/share/snpeff-4.3.1t-2/data/${sampleShort}//genes.gff
##-------------
##05_build database
##-------------
cd /home/vincent/miniconda2/share/snpeff-4.3.1t-2/
java -jar snpEff.jar build -gff3 -v ${sampleShort}
##==================
## running snpEff
##==================
# Directory with the freebayes calls; every file written below lands here too.
vcfDir=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final/freebayesOuput_WithONT_Parallel_default
# Common prefix of all call-set file names.
callsPrefix=${vcfDir}/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL
##-------------
## 06 preparation of the SNV file (not used any more)
##-------------
#grep -e "${sampleShort}" ${BaseLocation}/${FREEBAYES_out}/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL.recode.vcf > ${BaseLocation}/${FREEBAYES_out}/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_all.vcf
##-------------
## 07 snpEff calling
##-------------
cd /home/vincent/miniconda2/share/snpeff-4.3.1t-2/
mkdir -p ${sampleShort} ${BaseLocation}/${FREEBAYES_out}/SnpEff/${sampleShort}/
# Drop every line that contains "phage".
grep "phage" -v ${callsPrefix}.recode.vcf > ${callsPrefix}.recode_wophage.vcf
# First line of the prepR table, one field per line, minus the first nine
# fields -> list of sample names.
head -1 ${callsPrefix}_prepR.vcf|sed 's/\t/\n/g'|sed '1,9d' > ${vcfDir}/samples_names.txt
# Annotate the phage-free calls against the ${sampleShort} database.
java -Xmx64g -jar /home/vincent/miniconda2/share/snpeff-4.3.1t-2/snpEff.jar ${sampleShort} \
${callsPrefix}.recode_wophage.vcf > ${callsPrefix}_snpeff.vcf
##-------------
## 08 prep for R
##-------------
# Header lines removed; only the first eight columns are kept.
grep "^#" -v ${callsPrefix}_snpeff.vcf | \
awk -F "\t" 'BEGIN{OFS="\t"} {print $1,$2,$3,$4,$5,$6,$7,$8}' > ${callsPrefix}_snpeff_prepforR.vcf
##==================
##transfer local
##==================
# Settings for the transfer step. The last assignment used to be fused with
# the following paragraph on one line, which turned it into an attempt to run
# a command called "we" instead of setting FREEBAYES_out.
threads=37
samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt ##the file with all sample names
logFilelocation=/archiv/Projects/2019_RMK202_analysis/01_log
BaseLocation=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/di_rmk202_MAG_reference_polished.fasta
FREEBAYES_out=freebayesOuput_WithONT_Parallel_default

Here we merge all annotation knowledge we have of the SNPs:
- SNPeffect
- eggnog annotation
- repeat annotation
- core genes
Thereafter we have a long list of information for every SNP for which we can go into R.
Hereafter we go into R and do the following filtering.
The alternative allele must be supported by at least 3 reads, or by none at all (zero).
We have to be careful when removing low-quality SNP calls, as they do not seem to be low quality in the sense of low coverage, but rather are supported by only a fraction of the samples. This makes sense for the many SNPs that are only valid for Sterm and have NAs in the Ldel genomes. Therefore we rather select SNPs by requiring a minimum coverage supporting the SNP per sample.
!!IMPORTANT!! Before starting we have to add the following to the SNP.vcf:
- SNPeff
- whether the gene is core or not (with roary and intersect)
- (whether the SNP lies within a repeat)
library(tidyverse)
library(vcfR)
library(readr)
library(plyr)
library(dplyr)
library(ggplot2)
library(xlsx)
##==================
##01_import data
##==================
# Tab-separated freebayes call table.
snps_freebayes <- read_delim("../data_zenodo/non_genomic_data/variantCallsfreebayes.txt", "\t", escape_double = FALSE, trim_ws = TRUE)
# A bare file path stood here as a statement and failed at run time; it is kept
# as a note only (not read in this section):
# ../data_zenodo/non_genomic_data/Coverage_bacteria_and_phages_from_MAG.bed
table(snps_freebayes$`#CHROM`)

# MQM is taken from INFO by key instead of by position (it used to be read
# from the 16th ";"-separated field). For comma-separated values the first
# entry is used, as before; records without an MQM key become NA.
mqm_first_value <- str_match(snps_freebayes$INFO, "(?:^|;)MQM=([^;,]*)")[, 2]
snps_freebayes$MQM <- as.numeric(mqm_first_value)
##==================
##01_filter SNVs MQM larger than 30
##INFO=<ID=MQM,Number=A,Type=Float,Description="Mean mapping quality of observed alternate alleles">
##==================
# Keep records with MQM strictly above 30 (unchanged). The "removed" overview
# is now the exact complement of the kept set: it used to list only MQM < 30,
# so records with MQM == 30 or a missing MQM were dropped without being counted.
keep_mqm <- !is.na(snps_freebayes$MQM) & snps_freebayes$MQM > 30
tmp_remove <- snps_freebayes[!keep_mqm, ]
table(tmp_remove$`#CHROM`)
snps_freebayes <- snps_freebayes[keep_mqm, ] %>% select(-MQM)
table(snps_freebayes$`#CHROM`)
##==================
## 02 long table and allele-frequency calculation
##==================
table(snps_freebayes$`#CHROM`)

## wide -> long: columns 10 to the last one are gathered into `sample`
## (a factor) and `Snps`; missing entries are dropped.
first_sample_column <- colnames(snps_freebayes)[10]
last_sample_column <- colnames(snps_freebayes)[ncol(snps_freebayes)]
snps_freebayes_long <- gather(
  snps_freebayes, sample, Snps,
  first_sample_column:last_sample_column,
  factor_key = TRUE, na.rm = TRUE
)
nrow(snps_freebayes_long)
table(snps_freebayes_long$sample)

###------------------------remove non-called SNVs (field is ".")
is_called <- snps_freebayes_long$Snps != "."
snps_freebayes_long_cleaned <- snps_freebayes_long[is_called, ]
nrow(snps_freebayes_long_cleaned)

###------------------------allele frequency (AO/DP) and quality score (QA/DP)
# transform() is kept on purpose: later code addresses the chromosome column
# as X.CHROM. NOTE(review): presumably that name comes from transform()'s
# data.frame conversion of `#CHROM` - confirm before swapping in mutate().
snp_fields <- separate(
  snps_freebayes_long_cleaned, "Snps",
  into = c("GT","DP","AD","RO","QR","AO","QA","GL"),
  sep = ":"
)
sample_relative_temp <- transform(
  snp_fields,
  allel_frequency = as.numeric(AO) / as.numeric(DP),
  QualityScore = as.numeric(QA) / as.numeric(DP)
)
##==================
## 03 filter on QualityScore = QA/DP
##INFO=<ID=QA,Number=A,Type=Integer,Description="Alternate allele quality sum in phred">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus"
## NOTE(review): this header used to announce a threshold of 30, the code applies 5.
##==================
# Entries without a computable quality score are dropped first.
has_quality_score <- !is.na(sample_relative_temp$QualityScore)
sample_relative_temp_filter <- sample_relative_temp[has_quality_score, ]

# Keep entries with QA/DP above 5.
passes_quality <- which(sample_relative_temp_filter$QualityScore > 5)
sample_relative_temp_withONT <- sample_relative_temp_filter[passes_quality, ]
table(sample_relative_temp_withONT$sample)

# Site id: chromosome, position, reference and alternative allele joined by "_".
sample_relative_temp_withONT$site <- with(
  sample_relative_temp_withONT,
  paste(X.CHROM, POS, REF, ALT, sep = "_")
)
sample_relative_temp <- sample_relative_temp_withONT
##==================
## 03 filter on allele frequency, depth and alternative-allele support
##==================
## entries without a numeric allele frequency are removed
has_frequency <- which(!is.na(sample_relative_temp$allel_frequency))
sample_relative_temp_cleaned <- sample_relative_temp[has_frequency, ]

## remove low coverage: keep entries with depth above `cutoff`
cutoff <- 30
sample_relative_temp_cleaned$DP <- as.numeric(sample_relative_temp_cleaned$DP)
nrow(sample_relative_temp_cleaned)
deep_enough <- which(sample_relative_temp_cleaned$DP > cutoff)
sample_relative_safe <- sample_relative_temp_cleaned[deep_enough, ]
nrow(sample_relative_safe)

## alternative allele must be seen more than twice, or not at all
sample_relative_safe$AO <- as.numeric(sample_relative_safe$AO)
weakly_supported <- sample_relative_safe$AO > 0 & sample_relative_safe$AO <= 2
nrow(sample_relative_safe[weakly_supported, ])
well_supported_or_absent <- sample_relative_safe$AO == 0 | sample_relative_safe$AO > 2
sample_relative_safe <- sample_relative_safe[well_supported_or_absent, ]
nrow(sample_relative_safe)
##==================
## 04 preparation and reshaping to wide format
##==================
sample_relative_temp_cleaned <- sample_relative_safe

## site id: chromosome, position, reference and alternative allele joined by "_"
sample_relative_temp_cleaned$site <- with(
  sample_relative_temp_cleaned,
  paste(X.CHROM, POS, REF, ALT, sep = "_")
)

### one row per site, one column per sample; absent combinations become 0
nrow(sample_relative_temp_cleaned)
wide_input_columns <- c("X.CHROM","POS","site","allel_frequency","sample")
sample_relative_safe_final_prep <- sample_relative_temp_cleaned[, wide_input_columns]
sample_relative_safe_final_tsne <- sample_relative_safe_final_prep %>%
  spread(sample, allel_frequency) %>%
  replace(is.na(.), 0)
nrow(sample_relative_safe_final_tsne)
table(sample_relative_safe_final_prep$sample)
##==================
##05_filter rowsums!=0
##==================
## Columns 1-3 of the wide table are X.CHROM, POS and site; everything from
## column 4 onwards is a per-sample allele frequency.
nrow(sample_relative_safe_final_tsne)
site_totals <- rowSums(
  sample_relative_safe_final_tsne[, 4:ncol(sample_relative_safe_final_tsne)],
  na.rm = TRUE
)
## Number of sites whose frequency is 0 in every sample ...
sum(site_totals == 0)
## ... and the table without them.
sample_relative_wide <- sample_relative_safe_final_tsne[site_totals > 0, ]
table(sample_relative_wide$X.CHROM)

## Sites located on contigs whose name contains "phage".
sample_relative_wide_phagesONly <- sample_relative_wide[grepl("phage", sample_relative_wide$X.CHROM), ]

## Drop POS and add species / culture labels; the two accessions are mapped
## to species names, any other contig name is left unchanged by revalue().
sample_relative_wide$POS <- NULL
sample_relative_wide$species <- revalue(
  sample_relative_wide$X.CHROM,
  c("CP046131" = "L. delbrueckii", "CP046134" = "S. thermophilus")
)
sample_relative_wide$culture <- "RMK202"
nrow(sample_relative_wide)
##==================
##06_add gene information
##==================
## Per-site annotation table (SnpEff effect, eggNOG, repeat cluster, core flag).
RMK202_snpeff_eggnog_repeats_core <- read_csv(
  "~/Desktop/Projects/2019_Pilotplan/04_mapping2ONT/SnpEff/new/RMK202_snpeff_eggnog_repeats_core.txt"
)

## Left join on `site`: every SNP row is kept whether or not it is annotated.
## Columns present in both tables come back with .x / .y suffixes, hence
## X.CHROM.x and species.x below.
sample_relative_wide_description <- merge(
  x = sample_relative_wide,
  y = RMK202_snpeff_eggnog_repeats_core,
  by = "site",
  all.x = TRUE
)
colnames(sample_relative_wide_description)

#---------------------------------subset for interesting columns
## The column order matters downstream: the sample columns must stay
## contiguous, starting at "L104" and ending at "24855".
sample_relative_wide_description_interest <- sample_relative_wide_description[
  ,
  c(
    ## site / gene annotation
    "X.CHROM.x", "species.x", "site", "culture", "finalName", "Preferred_name",
    "effect", "significance", "geneName", "COG_Functional_Category",
    "Repeat_cluster", "core",
    ## L* / S* samples
    "L104", "L108", "L35", "L44", "L70", "L71", "L80", "L99", "S50", "S72",
    ## culture samples
    "Lyo_202_2012", "Lyo_202_2014", "Konserve_202", "RMK202", "Versand_202",
    "lyo202_96", "di_K2_6h", "th_K2_8h",
    ## G*_6_18 samples
    "G1_6_18", "G2_6_18", "G3_6_18", "G4_6_18", "G5_6_18",
    ## numeric sample IDs
    "12107", "24776", "24778", "24779", "24780", "24781", "24782", "24783",
    "24798", "24777", "13491", "13492", "13493", "13494", "13495", "13496",
    "13497", "13498", "13500", "24737", "24738", "24739", "24740", "13499",
    "24853", "24854", "24855"
  ),
  drop = FALSE
]
colnames(sample_relative_wide_description_interest)
nrow(sample_relative_wide_description_interest)
##==================
##07 rename samples
##==================
## Translate raw column names into display labels. Columns that are not
## listed (e.g. S50, S72 and the numeric sample IDs) keep their names.
colnames(sample_relative_wide_description_interest) <- revalue(
  colnames(sample_relative_wide_description_interest),
  c(
    "species.x" = "species",
    "L70" = "mst3", "L44" = "mst10", "L108" = "mst7", "L99" = "mst8",
    "L104" = "mst6", "L80" = "mst5", "L35" = "mst9", "L71" = "mst4",
    "lyo202_96" = "Lyo\n1996",
    "Lyo_202_2012" = "Lyo\n2012",
    "Lyo_202_2014" = "Lyo\n2014",
    "Konserve_202" = "working\nstock",
    "RMK202" = "starter\nculture\n2012",
    "Versand_202" = "starter\nculture\n2018",
    "di_K2_6h" = "cheesemaking\nday1",
    "th_K2_8h" = "cheesemaking\nday2",
    "G1_6_18" = "experiment_A",
    "G2_6_18" = "experiment_B",
    "G3_6_18" = "experiment_C",
    "G4_6_18" = "experiment_D",
    "G5_6_18" = "experiment_E"
  )
)
##==================
##08_remove non-bacterial SNPs
##==================
## Keep only sites on the two chromosome accessions CP046131 and CP046134.
table(sample_relative_wide_description_interest$X.CHROM.x)
nrow(sample_relative_wide_description_interest)
sample_relative_wide_description_interest_subset <- subset(
  sample_relative_wide_description_interest,
  X.CHROM.x %in% c("CP046131", "CP046134")
)
nrow(sample_relative_wide_description_interest_subset)
table(sample_relative_wide_description_interest_subset$X.CHROM.x)
sample_relative_wide <- sample_relative_wide_description_interest_subset
##==================
##09_order variables properly
##==================
## Fix the level order of the SnpEff impact classes.
sample_relative_wide$significance <- factor(
  sample_relative_wide$significance,
  levels = c("MODIFIER", "LOW", "MODERATE", "HIGH")
)
##==================
##10_dN_dS ratio
##==================
## Per-gene counts of SNPs by SnpEff impact class, used as a proxy:
##   dS = number of "LOW" SNPs, dN = number of "MODERATE" or "HIGH" SNPs.
## "MODIFIER" SNPs (and SNPs without an impact class) are left out.
###-----------prep for
prep_for_dN_dS_wide <- sample_relative_wide[which(sample_relative_wide$significance != "MODIFIER"), ]
###-----------calculate dS/dN ratio
## Build one row per gene and bind once at the end (instead of growing the
## data frame with rbind() inside a loop).
dN_dS_ratios <- do.call(rbind, lapply(
  as.character(unique(prep_for_dN_dS_wide$finalName)),
  function(gene) {
    gene_snps <- prep_for_dN_dS_wide[which(prep_for_dN_dS_wide$finalName == gene), ]
    n_dS <- sum(gene_snps$significance == "LOW", na.rm = TRUE)
    n_dN <- sum(gene_snps$significance %in% c("MODERATE", "HIGH"))
    data.frame(
      gene = gene,
      dS = n_dS,
      dN = n_dN,
      dN_dN.dS = n_dN / (n_dS + n_dN), # bounded in [0, 1]; NaN if no SNP matched
      dN_dS = n_dN / n_dS              # Inf when the gene has no "LOW" SNP
    )
  }
))
## The histogram shows dN/(dN+dS), matching the axis label and the metric that
## is carried forward below. The plain dN/dS ratio is Inf for every gene
## without a "LOW" SNP, which made mean() non-finite and the reference line
## impossible to draw.
plotDens <- ggplot(dN_dS_ratios, aes(x = dN_dN.dS)) +
  geom_histogram() +
  geom_vline(xintercept = mean(dN_dS_ratios$dN_dN.dS, na.rm = TRUE)) +
  theme_classic() +
  labs(x = "dN/(dN+dS)",
       y = "gene count")
plotDens
dN_dS_ratios <- arrange(dN_dS_ratios, desc(dN_dS))
##-------------------------------prep snpeff
## Gene-level annotation: drop the per-site columns and de-duplicate.
annotation_prep <- RMK202_snpeff_eggnog_repeats_core %>%
  select(-c(X1, X.CHROM, species, POS, REF, ALT, quality, site, effect, significance)) %>%
  distinct()
dN_dS_ratios_final <- merge(dN_dS_ratios, annotation_prep, by.x = "gene", by.y = "finalName", all.x = TRUE)
dN_dS_ratios_final_02 <- arrange(dN_dS_ratios_final, desc(dN_dS))
##-------------------------------wirte to file
# write.xlsx(dN_dS_ratios_final_02,file="Users//Desktop/presenations/my_presentations/20191007_groupmeeting/figures/dN_dS_ratios_genes_rmk202.xls", sheetName = "Sheet1",
# col.names = TRUE, row.names = FALSE, append = FALSE)
##-------------------------------merge with sample_relative_wide
## mutations = total number of counted SNPs per gene (dS + dN).
dN_dS_ratios$mutations <- dN_dS_ratios$dS + dN_dS_ratios$dN
dN_dS_ratios_prep <- dN_dS_ratios %>% select(gene, dN_dN.dS, mutations)
## Left join: finalName becomes the first column, the two gene-level columns
## are appended after the sample columns.
sample_relative_wide <- merge(sample_relative_wide, dN_dS_ratios_prep, by.y = "gene", by.x = "finalName", all.x = TRUE)
##==================
##11_make long
##==================
## Wide -> long: every sample column from "mst6" through "24855" becomes one
## (sample, Snps) row; rows with NA frequency are dropped and `sample` is
## stored as a factor in column order.
colnames(sample_relative_wide)
sample_relative_long <- gather(sample_relative_wide, sample, Snps, "mst6":"24855", factor_key = TRUE, na.rm = TRUE)
nrow(sample_relative_long)
## The chromosome column is called X.CHROM.x after the annotation merge.
## Spell it out instead of relying on `$` partial matching, which silently
## returns NULL on tibbles.
table(sample_relative_long$X.CHROM.x)
##==================
##12_make a unique gene infomoration
##==================
RMK202_snpeff_eggnog_repeats_core <- read_csv("~/Desktop/Projects/2019_Pilotplan/04_mapping2ONT/SnpEff/new/RMK202_snpeff_eggnog_repeats_core.txt")
## Gene-level annotation: remove the per-site columns, de-duplicate, then
## attach dN/(dN+dS) and the mutation count per gene.
RMK202_snpeff_eggnog_repeats_core_uniuqes <- RMK202_snpeff_eggnog_repeats_core %>%
  select(-c("X1", "X.CHROM", "POS", "REF", "ALT", "quality", "site", "effect", "significance", "best_tax_level")) %>%
  distinct()
RMK202_snpeff_eggnog_repeats_core_uniuqes_final <- merge(RMK202_snpeff_eggnog_repeats_core_uniuqes, dN_dS_ratios_prep, by.y = "gene", by.x = "finalName", all.x = TRUE)
write.table(RMK202_snpeff_eggnog_repeats_core_uniuqes_final, quote = FALSE, sep = "\t", file = "~/Desktop/presenations/my_presentations/20191007_groupmeeting/figures/RMK202_snpeff_eggnog_repeats_core_dNdS_uniqueGenes.txt")
colnames(RMK202_snpeff_eggnog_repeats_core_uniuqes_final)
## NOTE(review): this "reduce" table is built from the raw annotation table,
## not from the *_uniuqes_final table above - confirm that is intended.
RMK202_snpeff_eggnog_repeats_core_uniuqes_final_reduce <- RMK202_snpeff_eggnog_repeats_core %>%
  select(-c("finalName", "Preferred_name", "EC", "KEGG_ko", "KEGG_Reaction", "CAZy", "BiGG_Reaction", "COG_Functional_Category", "eggNOG free text description")) %>%
  distinct()
###------------------------------------------
#wide only metagenomes
###------------------------------------------
## Restrict the long table to the listed culture / experiment samples, spread
## it back to one row per site, and keep only sites flagged core == "core".
table(sample_relative_long$sample)
sample_relative_long_meta <- sample_relative_long %>%
  filter(sample %in% c("Lyo\n1996", "Lyo\n2012", "Lyo\n2014", "working\nstock", "starter\nculture\n2012", "starter\nculture\n2018", "cheesemaking\nday1", "cheesemaking\nday2", "experiment_A", "experiment_B", "experiment_C", "experiment_D", "experiment_E"))
sample_relative_wide_meta <- spread(sample_relative_long_meta, sample, Snps) %>%
  filter(core == "core") # %>%replace(is.na(.), 0)
##----------------
##stats
##----------------
table(sample_relative_wide_meta$core)
length(sample_relative_wide_meta$core)
table(sample_relative_wide_meta$species)
table(sample_relative_wide_meta$significance)
## Collapse the four SnpEff impact classes into two groups for plotting:
## MODIFIER and LOW -> "synonymous", MODERATE and HIGH -> "non-synonymous".
sample_relative_wide_meta$synonomous <- revalue(
  sample_relative_wide_meta$significance,
  c("MODIFIER" = "synonymous", "LOW" = "synonymous", "MODERATE" = "non-synonymous", "HIGH" = "non-synonymous")
)
table(interaction(sample_relative_wide_meta$species, sample_relative_wide_meta$synonomous))
## Bar chart of SNP counts per group, coloured by impact class, one panel per
## species.
plotSNPs <- ggplot(sample_relative_wide_meta, aes(x = synonomous, fill = significance)) +
  geom_bar() +
  theme_classic() +
  facet_wrap(~species, scales = "free") +
  labs(x = "", y = "count", fill = "SNP effect") +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9))
plotSNPs
# svg("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance_bac_snps.svg",width=5,height=4.5)
png("~/Desktop/Manuscripts/2019_RMK202/Figures/supplement_SNP_effect.png", width = 1600, height = 1800, res = 300)
## Explicit print(): a bare ggplot object is only drawn by top-level
## auto-printing, so under source() or inside a function the file would be
## written empty.
print(plotSNPs)
dev.off()
##============
### plot
##============
all_colours <- c("#10B552", "#EB4D4D")
colnames(sample_relative_safe_woSTRAINS)
table(sample_relative_safe_woSTRAINS$sample)
##============
### subset :
##----01==all SNPs of the long table (the genic-only filter is disabled)
##----02==only core genes mutations
##----03/04/05==frequencies at or below 0.03 are set to 0
##----06==only moderate or high significance SNVs
##============
table(sample_relative_long$sample)
nrow(sample_relative_long)

## 01: starting point
sample_relative_safe_woSTRAINS_sub01 <- sample_relative_long
# sample_relative_safe_woSTRAINS_sub01 <- sample_relative_safe_woSTRAINS[which(sample_relative_safe_woSTRAINS$significance!="MODIFIER"),]
nrow(sample_relative_safe_woSTRAINS_sub01)

## 02: core-genome sites only
sample_relative_safe_woSTRAINS_sub02 <- subset(sample_relative_safe_woSTRAINS_sub01, core == "core")
table(sample_relative_safe_woSTRAINS_sub02$sample)

## 03: observations above the 0.03 frequency threshold, kept as they are
sample_relative_safe_woSTRAINS_sub03 <- subset(sample_relative_safe_woSTRAINS_sub02, Snps > 0.03)

## 04: observations at or below the threshold, frequency forced to 0
sample_relative_safe_woSTRAINS_sub04 <- subset(sample_relative_safe_woSTRAINS_sub02, Snps <= 0.03)
sample_relative_safe_woSTRAINS_sub04$Snps <- 0
nrow(sample_relative_safe_woSTRAINS_sub04)

## 05: both parts stacked again (above-threshold rows first)
sample_relative_safe_woSTRAINS_sub05 <- rbind(
  sample_relative_safe_woSTRAINS_sub03,
  sample_relative_safe_woSTRAINS_sub04
)
nrow(sample_relative_safe_woSTRAINS_sub05)
table(sample_relative_safe_woSTRAINS_sub05$sample)

## 06: MODERATE / HIGH impact only
sample_relative_safe_woSTRAINS_sub06 <- subset(
  sample_relative_safe_woSTRAINS_sub05,
  significance %in% c("MODERATE", "HIGH")
)

## S. thermophilus subset, long and wide (NA -> 0 in the wide table)
sample_relative_safe_Sterm_final <- subset(sample_relative_safe_woSTRAINS_sub06, species == "S. thermophilus")
table(sample_relative_safe_Sterm_final$sample)
sample_relative_safe_Sterm_final_wide <- sample_relative_safe_Sterm_final %>%
  spread(sample, Snps) %>%
  replace(is.na(.), 0)
table(sample_relative_safe_Sterm_final_wide$sample)
# length(unique(sample_relative_safe_Sterm_final$site))
##============
### plot Streptococcus thermophius
##============
colnames(sample_relative_safe_Sterm_final)
table(sample_relative_safe_Sterm_final$sample)
## The two cheesemaking samples are relabelled "Reference 1/2" here and
## removed again before plotting (see the last filter below).
sample_relative_safe_Sterm_final$sample <- revalue(sample_relative_safe_Sterm_final$sample, c("cheesemaking\nday2" = "Reference 2", "cheesemaking\nday1" = "Reference 1"))
table(sample_relative_safe_Sterm_final$sample)
## Keep only the listed culture / experiment / reference samples.
sample_relative_safe_woSTRAINS <- sample_relative_safe_Sterm_final %>%
  filter(sample %in% c("Lyo\n1996", "Lyo\n2012", "Lyo\n2014", "working\nstock", "starter\nculture\n2012", "starter\nculture\n2018", "cheesemaking\nday1", "cheesemaking\nday2", "experiment_A", "experiment_B", "experiment_C", "experiment_D", "experiment_E", "Reference 1", "Reference 2"))
sample_relative_safe_woSTRAINS$sample <- droplevels(sample_relative_safe_woSTRAINS$sample)
## Explicit level order = order of the samples on the x axis.
sample_relative_safe_woSTRAINS$sample <- factor(sample_relative_safe_woSTRAINS$sample, levels = c("Lyo\n1996", "Lyo\n2012", "Lyo\n2014", "working\nstock", "starter\nculture\n2012", "starter\nculture\n2018", "experiment_A", "experiment_B", "experiment_C", "experiment_D", "experiment_E", "Reference 1", "Reference 2"))
levels(sample_relative_safe_woSTRAINS$sample)
table(sample_relative_safe_woSTRAINS$sample)
sample_relative_safe_woSTRAINS <- sample_relative_safe_woSTRAINS %>%
  filter(sample != "Reference 1") %>%
  filter(sample != "Reference 2")
## One thin line per SNP site across the samples. The `text` aesthetic only
## carries annotation (effect, significance, gene, repeat cluster, core, COG)
## and is not drawn by geom_line.
p3 <- ggplot(sample_relative_safe_woSTRAINS, aes(x = sample, y = Snps, group = site, color = species, fill = species, text = paste("effect:", effect, "\nsignficance:", significance, "\ngeneName:", geneName, "\nRepeat_identity:", Repeat_cluster, "\ncore:", core, "\nCOG:", COG_Functional_Category))) +
  geom_line(size = 0.2, alpha = .1) +
  labs("",
       x = "",
       y = "Alternative allele frequency") +
  theme_classic() +
  scale_fill_manual(values = "#0081a7") +
  scale_color_manual(values = "#0081a7") +
  scale_x_discrete(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
        rect = element_rect(fill = "transparent"), # all rectangles
        legend.position = "none"
  )
p3
svg("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance_bac_snps_sterm.svg", width = 6, height = 5)
# png("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.png", width = 1800, height = 1200,res=300)
## Explicit print(): a bare `p3` is only drawn by top-level auto-printing, so
## under source() or inside a function the SVG would be written empty.
print(p3)
dev.off()
##============
### plot Lactobacillus delbrueckii
##============
## Builds `sample_relative_safe_woSTRAINS` for the L. delbrueckii plot.
## Net effect of this section: the table ends up as `sample_relative_long`
## restricted to the listed culture/experiment samples, without the two
## cheesemaking samples and without S. thermophilus rows. The sub01..sub05
## steps only feed the diagnostics printed along the way, because sub06 is
## overwritten with the unfiltered long table further down.
nrow(sample_relative_long)
sample_relative_safe_woSTRAINS_sub01 <- sample_relative_long
# sample_relative_safe_woSTRAINS_sub01 <- sample_relative_safe_woSTRAINS[which(sample_relative_safe_woSTRAINS$significance!="MODIFIER"),]
nrow(sample_relative_safe_woSTRAINS_sub01)
table(sample_relative_safe_woSTRAINS_sub01$species)
## The next assignment is immediately replaced by the core-only subset.
sample_relative_safe_woSTRAINS_sub02 <- sample_relative_safe_woSTRAINS_sub01
sample_relative_safe_woSTRAINS_sub02 <- sample_relative_safe_woSTRAINS_sub01[which(sample_relative_safe_woSTRAINS_sub01$core=="core"),]
nrow(sample_relative_safe_woSTRAINS_sub02)
table(sample_relative_safe_woSTRAINS_sub02$species)
## Observations above 0.03, taken from sub01 (all sites), not from the
## core-only sub02.
sample_relative_safe_woSTRAINS_sub03 <- sample_relative_safe_woSTRAINS_sub01[which(sample_relative_safe_woSTRAINS_sub01$Snps>0.03),]
# sample_relative_safe_woSTRAINS_sub05 <- sample_relative_safe_woSTRAINS_sub03
table(sample_relative_safe_woSTRAINS_sub03$species)
## sub04 = L. delbrueckii observations above 0.03; the next line prints the
## number of distinct sites among them.
sample_relative_safe_woSTRAINS_sub04 <- sample_relative_safe_woSTRAINS_sub03 %>% filter(species=="L. delbrueckii")
unique(sample_relative_safe_woSTRAINS_sub04$site) %>% length()
## sub03 is reused for the core observations at or below 0.03.
sample_relative_safe_woSTRAINS_sub03 <- sample_relative_safe_woSTRAINS_sub02[which(sample_relative_safe_woSTRAINS_sub02$Snps<=0.03),]
## NOTE(review): this zeroes the ABOVE-threshold L. delbrueckii rows (sub04),
## whereas the S. thermophilus section zeroes the at-or-below-threshold rows.
## It has no downstream effect because sub06 is overwritten below - confirm
## the intent.
sample_relative_safe_woSTRAINS_sub04$Snps <- 0
table(sample_relative_safe_woSTRAINS_sub04$species)
nrow(sample_relative_safe_woSTRAINS_sub04)
## sub05 and sub06 are each assigned and then overwritten; only the last
## assignment of sub06 (the full long table) is used afterwards.
sample_relative_safe_woSTRAINS_sub05 <- rbind(sample_relative_safe_woSTRAINS_sub03,sample_relative_safe_woSTRAINS_sub04)
sample_relative_safe_woSTRAINS_sub05 <- sample_relative_safe_woSTRAINS_sub04
sample_relative_safe_woSTRAINS_sub06 <- sample_relative_safe_woSTRAINS_sub05[which(sample_relative_safe_woSTRAINS_sub05$significance %in% c("MODERATE","HIGH")),]
sample_relative_safe_woSTRAINS_sub06 <- sample_relative_safe_woSTRAINS_sub04
sample_relative_safe_woSTRAINS_sub06 <- sample_relative_long
table(sample_relative_safe_woSTRAINS_sub06$species)
## All rows that are not S. thermophilus.
sample_relative_safe_Sterm_final_ldel <- sample_relative_safe_woSTRAINS_sub06[which(sample_relative_safe_woSTRAINS_sub06$species!="S. thermophilus"),]
table(sample_relative_safe_Sterm_final_ldel$species)%>%replace(is.na(.), 0)
## Overwrites the wide table created in the S. thermophilus section with the
## non-S. thermophilus data (NA values are kept here).
sample_relative_safe_Sterm_final_wide <- spread(sample_relative_safe_Sterm_final_ldel, sample, Snps)
## Prints the complete table left over from the S. thermophilus plot.
sample_relative_safe_woSTRAINS
table(sample_relative_safe_woSTRAINS_sub06$species)
## The next assignment is immediately replaced by the one after it.
sample_relative_safe_woSTRAINS <- sample_relative_safe_Sterm_final[grep("^mst",sample_relative_safe_Sterm_final$sample,invert = TRUE),]
sample_relative_safe_woSTRAINS <- sample_relative_safe_woSTRAINS_sub06 %>% filter(sample %in% c("Lyo\n1996","Lyo\n2012","Lyo\n2014","working\nstock","starter\nculture\n2012","starter\nculture\n2018","cheesemaking\nday1","cheesemaking\nday2","experiment_A","experiment_B","experiment_C","experiment_D","experiment_E"))
sample_relative_safe_woSTRAINS$sample <- droplevels(sample_relative_safe_woSTRAINS$sample)
## Explicit level order = order of the samples on the x axis.
sample_relative_safe_woSTRAINS$sample = factor(sample_relative_safe_woSTRAINS$sample, levels=c("Lyo\n1996","Lyo\n2012","Lyo\n2014","working\nstock","starter\nculture\n2012","starter\nculture\n2018","cheesemaking\nday1","cheesemaking\nday2","experiment_A","experiment_B","experiment_C","experiment_D","experiment_E"))
levels(sample_relative_safe_woSTRAINS$sample)
table(sample_relative_safe_woSTRAINS$sample)
## Drop the two cheesemaking samples from the plot data.
sample_relative_safe_woSTRAINS <- sample_relative_safe_woSTRAINS %>% filter(sample!="cheesemaking\nday1") %>% filter(sample!="cheesemaking\nday2")
all_colours <- rev(c("#EB4D4D","#10B552") )
table(sample_relative_safe_woSTRAINS$species)
## Final plot data: everything except S. thermophilus.
sample_relative_safe_woSTRAINS <- sample_relative_safe_woSTRAINS[which(sample_relative_safe_woSTRAINS$species!="S. thermophilus"),]
## One line per SNP site across the samples. The `text` aesthetic only carries
## annotation (effect, significance, gene, repeat cluster, core, COG) and is
## not drawn by geom_line.
p3 <- ggplot(sample_relative_safe_woSTRAINS, aes(x = sample, y = Snps, group = site, color = species, fill = species, text = paste("effect:", effect, "\nsignficance:", significance, "\ngeneName:", geneName, "\nRepeat_identity:", Repeat_cluster, "\ncore:", core, "\nCOG:", COG_Functional_Category))) +
  geom_line(size = 0.5, alpha = 0.5) +
  labs("",
       x = "",
       y = "Alternative allele frequency") +
  theme_classic() +
  scale_fill_manual(values = all_colours) +
  scale_color_manual(values = all_colours) +
  scale_x_discrete(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
        rect = element_rect(fill = "transparent"), # all rectangles
        legend.position = "none"
  )
p3
svg("~/Desktop/mid_thesis/report/figures/20200430/chapter1/F1_relativeAbundance_bac_snps_Ldel.svg", width = 6, height = 5)
# png("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.png", width = 1800, height = 1200,res=300)
## Explicit print(): a bare `p3` is only drawn by top-level auto-printing, so
## under source() or inside a function the SVG would be written empty.
print(p3)
dev.off()
1.3.2 Phylogeny
This is Kirstin’s approach, which is based on OrthoFinder.
It is described in more detail here.
## Collect the protein FASTA files (.faa) of every genome into a fresh
## <species>_references/FAA directory: all files from the PROKKA annotation
## plus the L-DSM-2007 reference from the NCBI reference set.
ref_tree=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree
for species in Ldel Sterm
do
  faa_dir="${ref_tree}/${species}_references/FAA"
  # -f: do not fail on the first run, when the directory does not exist yet
  rm -rf "${faa_dir}"
  mkdir -p "${faa_dir}"
  cp /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/${species}/FAA_all/* "${faa_dir}/"
  #cp /data/Project/2020_StarterCultureDiversity/10_REFERENCE_GEN0MES/genomes/NCBI_Refs/${species}_references/PROKKA_FAA/* "${faa_dir}/"
  cp /data/Project/2020_StarterCultureDiversity/10_REFERENCE_GEN0MES/genomes/NCBI_Refs/${species}_references/PROKKA_FAA/L-DSM-2007.faa "${faa_dir}/"
  echo "==========================================================="
  # `ll` is an interactive shell alias and is not available in scripts
  ls -l "${faa_dir}/"
done
###-----------------------------------------
##orthorfinder
###-----------------------------------------
## Run OrthoFinder per species on the collected .faa files. Any previous
## result directory of the species is removed first so that -o points at a
## fresh location.
ref_tree=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree
for species in Ldel Sterm
do
  mkdir -p "${ref_tree}/Orthofinder/"
  # -f: do not fail when there is no previous run
  rm -rf "${ref_tree}/Orthofinder/${species}"
  #mkdir -p /data/Project/2020_StarterCultureDiversity/11_referenceTree/Orthofinder_combined_NCBI_own/${species}/
  orthofinder -f "${ref_tree}/${species}_references/FAA/" \
    -t 35 \
    -o "${ref_tree}/Orthofinder/${species}" \
    -a 35
done
species=Ldel
## For each species: derive the list of single-copy orthogroups
## (Orthogroups_SCOG.txt) from OrthoFinder's Orthogroups.txt and print the
## line counts of both files.
for species in Ldel Sterm
do
  echo "==========================================="
  # name of the results folder inside the species' OrthoFinder directory
  date=$(ls /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species})
  echo -e ${species}
  orthogroups_dir=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species}/${date}/Orthogroups
  perl /data/Project/2020_StarterCultureDiversity/99_log/aln_aa_to_dna/vincent/get_singlecp_orthologs.pl \
    ${orthogroups_dir}/Orthogroups.txt \
    > ${orthogroups_dir}/Orthogroups_SCOG.txt
  wc -l ${orthogroups_dir}/Orthogroups_SCOG.txt
  wc -l ${orthogroups_dir}/Orthogroups.txt
done
#done
Here, I make a directory containing the nucleotide (.ffn) and protein (.faa) files of all genomes, with each file named after the locus tag of its genome.
## Collect the nucleotide gene FASTA files (.ffn) of every genome into a fresh
## <species>_references/FFN directory: the L-DSM-2007 reference plus all files
## from the PROKKA annotation.
for species in Ldel Sterm
do
  # name of the results folder inside the species' OrthoFinder directory
  date=$(ls /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species})
  ffn_dir=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/${species}_references/FFN
  rm -r ${ffn_dir}/
  mkdir -p ${ffn_dir}/
  # first letter of the species label ("L" or "S")
  leterzzz=${species:0:1}
  cp /data/Project/2020_StarterCultureDiversity/10_REFERENCE_GEN0MES/genomes/NCBI_Refs/${species}_references/PROKKA_FFN/L-DSM-2007.ffn ${ffn_dir}/
  cp /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/${species}/FFN_all/* ${ffn_dir}/
done
###--------------------
## Copy every genome's .faa and .ffn file into
## Orthofinder/<species>/<results>/Genomes_FAA_FNA/, renamed to the genome's
## locus tag (taken from the first FASTA header of the .faa file).
ref_tree=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree
for species in Ldel Sterm
do
  # Resolve the results folder for THIS species. Without this line ${date}
  # still holds the value left behind by the last iteration of an earlier loop.
  date=$(ls "${ref_tree}/Orthofinder/${species}")
  out_dir="${ref_tree}/Orthofinder/${species}/${date}/Genomes_FAA_FNA"
  rm -rf "${out_dir}"
  mkdir -p "${out_dir}"
  for faa_file in "${ref_tree}/${species}_references/FAA/"*.faa
  do
    # the glob stays literal when no .faa file exists
    [ -e "${faa_file}" ] || continue
    # strip only the trailing ".faa" (the old sed 's/.faa//g' removed every
    # "<any char>faa" occurrence anywhere in the name)
    genomesss=$(basename "${faa_file}" .faa)
    # locus tag = first header word, without ">" and without the "_000..." suffix
    locusTagsss=$(head -1 "${faa_file}" | cut -d ' ' -f 1 | sed 's/_000.*$//g' | sed 's/>//g')
    echo -e "This genome : " ${genomesss} " has the following locus Tag : "${locusTagsss}
    cat "${faa_file}" > "${out_dir}/${locusTagsss}.faa"
    cat "${ref_tree}/${species}_references/FFN/${genomesss}.ffn" > "${out_dir}/${locusTagsss}.ffn"
  done
done
##------=================================================================
#extract orthogroups
##------=================================================================
## Run extract_orthologs.pl from inside Genomes_FAA_FNA/ with the single-copy
## orthogroup list; its output goes to the folder named after the species.
ref_tree=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree
for species in Ldel Sterm
do
  # resolve the results folder per species instead of reusing a stale ${date}
  date=$(ls "${ref_tree}/Orthofinder/${species}")
  # skip the species if the directory is missing rather than running the
  # script in whatever directory happens to be current
  cd "${ref_tree}/Orthofinder/${species}/${date}/Genomes_FAA_FNA/" || continue
  perl /data/Project/2020_StarterCultureDiversity/99_log/aln_aa_to_dna/vincent/extract_orthologs.pl "${ref_tree}/Orthofinder/${species}/${date}/Orthogroups/Orthogroups_SCOG.txt" --folder ${species}
done
# align protein families
## Align every orthogroup protein file (<gene>.faa) with MAFFT; the alignment
## is written next to it as <gene>_aln.fasta.
ref_tree=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree
for species in Ldel Sterm
do
  date=$(ls "${ref_tree}/Orthofinder/${species}")
  fam_dir="${ref_tree}/Orthofinder/${species}/${date}/Genomes_FAA_FNA/${species}"
  cd "${fam_dir}" || continue
  for faa_file in *.faa
  do
    # the glob stays literal when no .faa file exists
    [ -e "${faa_file}" ] || continue
    genesss=${faa_file%.faa}
    # remove the previous alignment; the old command targeted
    # "<gene>.aln.fasta", which is not the name written below
    rm -f "${fam_dir}/${genesss}_aln.fasta"
    mafft --thread 37 --maxiterate 1000 --auto "${fam_dir}/${genesss}.faa" > "${fam_dir}/${genesss}_aln.fasta"
  done
done # species
# here I back translate the faa alignment to dna.
## Run aln_aa_to_dna.pl inside each species' orthogroup folder (the script is
## called without arguments, so it works on the current directory).
ref_tree=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree
for species in Ldel Sterm
do
  # resolve the results folder per species instead of reusing a stale ${date}
  date=$(ls "${ref_tree}/Orthofinder/${species}")
  # do not run the script in the wrong directory if cd fails
  cd "${ref_tree}/Orthofinder/${species}/${date}/Genomes_FAA_FNA/${species}" || continue
  perl /data/Project/2020_StarterCultureDiversity/99_log/aln_aa_to_dna/vincent/aln_aa_to_dna.pl
done
# With perl: prune alignments (removing columns represented by less than 50% of the sequences)
## Re-create the pruned alignments: delete old *_prune.fasta files in each
## species' orthogroup folder, then run prune_aln.pl there.
ref_tree=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree
for species in Ldel Sterm
do
  date=$(ls "${ref_tree}/Orthofinder/${species}")
  # Guard the cd: the rm below uses a glob relative to the current directory
  # and must never run anywhere else.
  cd "${ref_tree}/Orthofinder/${species}/${date}/Genomes_FAA_FNA/${species}" || continue
  rm -f ./*_prune.fasta
  perl /data/Project/2020_StarterCultureDiversity/99_log/aln_aa_to_dna/vincent/prune_aln.pl
done
# Strip the "L-I-202-" prefix from the pruned alignments.
# NOTE(review): this runs after the loop, so ${species} and ${date} hold the
# values of the last iteration (Sterm) and only that species is processed -
# confirm whether Ldel was meant to be included.
sed -i 's/L-I-202-//g' /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species}/${date}/Genomes_FAA_FNA/${species}/*_aln_prune.fasta
##================================================================
#remove all infomration (except genome info) from multifasta header
##================================================================
## In every pruned alignment, cut each line at the first "_0", which removes
## the gene-number part of the FASTA headers.
## Both species are processed: the former `| tail -1` acted on a single-line
## string and therefore selected nothing.
for species in Ldel Sterm
do
  date=$(ls /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species})
  cd /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species}/${date}/Genomes_FAA_FNA/${species}
  # alignment names without the "_aln_prune.fasta" suffix
  aln_stems=$(ls |grep "_aln_prune.fasta$" |sed 's/_aln_prune.fasta//g' )
  for genesss in ${aln_stems}
  do
    sed -i "s/_0.*//" ${genesss}_aln_prune.fasta
  done
done
##================================================================
#shorten names
##================================================================
## S. thermophilus only: rename "202-13499c" to "202-13499" in all pruned
## alignments.
species=Sterm
# resolve the results folder explicitly instead of relying on a leftover ${date}
date=$(ls /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species})
# only edit files if the directory change succeeded
if cd /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species}/${date}/Genomes_FAA_FNA/${species}
then
  for genesss in $(ls |grep "_aln_prune.fasta$" |sed 's/_aln_prune.fasta//g' )
  do
    sed -i 's/202-13499c/202-13499/g' ${genesss}_aln_prune.fasta
  done
fi
# With perl: Concatenate pruned alignments
## Concatenate the alignments of each species with cat_align.pl (run inside
## the orthogroup folder) into <results>/<species>.phylip.
ref_tree=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree
for species in Ldel Sterm
do
  echo -e ${species}
  date=$(ls "${ref_tree}/Orthofinder/${species}")
  # -f: no error when there is no previous output
  rm -f "${ref_tree}/Orthofinder/${species}/${date}/${species}.phylip"
  # do not run the script in the wrong directory if cd fails
  cd "${ref_tree}/Orthofinder/${species}/${date}/Genomes_FAA_FNA/${species}" || continue
  perl /data/Project/2020_StarterCultureDiversity/99_log/aln_aa_to_dna/vincent/cat_align.pl > "${ref_tree}/Orthofinder/${species}/${date}/${species}.phylip"
done
# With RAxML: infer the phylogeny
## Infer one tree per species from the concatenated alignment. RAxML is
## started from inside a fresh per-species directory and given the run name
## <species>_all; options: -f a, seeds 12345, -# 100, model GTRCAT, 37 threads.
raxml_root=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/raxml_all_combined_NCBI_own
rm -r ${raxml_root}/
mkdir -p ${raxml_root}/
for species in Ldel Sterm
do
  echo -e ${species}
  run_dir=${raxml_root}//${species}
  rm -r ${run_dir}
  mkdir -p ${run_dir}
  cd ${run_dir}
  # name of the results folder inside the species' OrthoFinder directory
  date=$(ls /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species})
  #rm -r *${species}_all*
  /home/vincent/anaconda3/bin/raxmlHPC-PTHREADS-AVX2 \
    -f a -x 12345 -p 12345 -# 100 -m GTRCAT \
    -s /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species}/${date}/${species}.phylip \
    -n ${species}_all \
    -T 37
done
##---------------------
## Change some names: "(S" and ",S" become "(S-" and ",S-"; a resulting
## double hyphen "S--" is collapsed back to "S-".
##---------------------
# RAxML is started from inside the per-species folder, so the bipartitions
# tree is read from .../raxml_all_combined_NCBI_own/Sterm/. The renamed tree is
# still written to the top-level raxml folder, as before.
sed 's/(S/(S-/g' /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/raxml_all_combined_NCBI_own/Sterm/RAxML_bipartitions.Sterm_all |sed 's/,S/,S-/g'|sed 's/S--/S-/g' > /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/raxml_all_combined_NCBI_own/RAxML_bipartitions.Sterm_all_new
#/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/raxml_all_combined_NCBI_own/Ldel/RAxML_bipartitions.Ldel_all
find close samples
# List the NCBI log entries of the species whose 12th tab-separated column is
# "Complete Genome" or "Chromosome" and that mention "zlw" (case-insensitive).
species="Streptococcus thermophilus"
grep -e "${species}" /data/Project/2020_StarterCultureDiversity/10_REFERENCE_GEN0MES/genomes/NCBI_Refs/log/20200728_NCBI_log.txt |awk -F "\t" 'BEGIN{OFS="\t"} {if ($12=="Complete Genome" ||$12=="Chromosome" ) print $0}'| grep "zlw" -i
make phylip to aligned multifasta file
from Bio import SeqIO


def _phylip_to_fasta(phylip_path, fasta_path):
    """Rewrite a PHYLIP alignment as FASTA and return the number of records written."""
    return SeqIO.write(SeqIO.parse(phylip_path, "phylip"), fasta_path, "fasta")


count = _phylip_to_fasta(
    "/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/Sterm/Results_Aug06/Sterm.phylip",
    "/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/Sterm/Results_Aug06/Sterm.fasta",
)
print("Converted %i records" % count)
# The Ldel record count is reported by the print statement that follows.
count = _phylip_to_fasta(
    "/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/Ldel/Results_Aug06/Ldel.phylip",
    "/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/Ldel/Results_Aug06/Ldel.fasta",
)
print("Converted %i records" % count)
1.3.3 strain count
Here, I evaluate how abundant the S. thermophilus strains are.
library(robustbase)
library(VennDiagram)
##==================================================Streptococcus thermophilus===================================================================
colnames(sample_relative_wide)
# Core S. thermophilus SNVs that exceed 5 % frequency in at least one
# metagenome sample (NAs are treated as 0).
# The sample columns are addressed through .data[[...]]: a bare string such as
# "Lyo\n1996" > 0.05 compares the *text* with the number, which is a constant
# and therefore never removes a row.
sample_relative_wide_snpsUsed <- sample_relative_wide %>%
  replace(is.na(.), 0) %>%
  filter(core == "core") %>%
  filter(species == "S. thermophilus") %>%
  filter(
    .data[["Lyo\n1996"]] > 0.05 |
      .data[["Lyo\n2012"]] > 0.05 |
      .data[["Lyo\n2014"]] > 0.05 |
      .data[["working\nstock"]] > 0.05 |
      .data[["starter\nculture\n2012"]] > 0.05 |
      .data[["starter\nculture\n2018"]] > 0.05 |
      .data[["experiment_A"]] > 0.05 |
      .data[["experiment_B"]] > 0.05 |
      .data[["experiment_C"]] > 0.05 |
      .data[["experiment_D"]] > 0.05 |
      .data[["experiment_E"]] > 0.05
  )
colnames(sample_relative_wide_snpsUsed)
# A SNV is assigned to a lineage when it is above 0.8 in at least one of the
# isolate columns listed for that lineage.
sample_relative_wide_snpsUsed_lin1 <- sample_relative_wide_snpsUsed %>%
  filter(
    `24853` > 0.8 | `24798` > 0.8 | `13493` > 0.8 | `13500` > 0.8 |
      `24737` > 0.8 | `24854` > 0.8 | `13491` > 0.8 | `13492` > 0.8
  ) %>%
  add_column(explained = "lin1")
sample_relative_wide_snpsUsed_lin2 <- sample_relative_wide_snpsUsed %>%
  filter(`S72` > 0.8 | `24855` > 0.8 | `13494` > 0.8) %>%
  add_column(explained = "lin2")
sample_relative_wide_snpsUsed_lin3 <- sample_relative_wide_snpsUsed %>%
  filter(`S50` > 0.8 | `24740` > 0.8 | `24738` > 0.8 | `13499` > 0.8) %>%
  add_column(explained = "lin3")
sample_relative_wide_snpsUsed_lin4 <- sample_relative_wide_snpsUsed %>%
  filter(`24739` > 0.8 | `13497` > 0.8 | `13496` > 0.8 | `13495` > 0.8) %>%
  add_column(explained = "lin4")
# All lineage-assigned sites; a site appears once per lineage it matches.
allsites <- c(
  sample_relative_wide_snpsUsed_lin1$site,
  sample_relative_wide_snpsUsed_lin2$site,
  sample_relative_wide_snpsUsed_lin3$site,
  sample_relative_wide_snpsUsed_lin4$site
)
##----------------------
# Venn diagram of the sites assigned to each lineage
##----------------------
temp <- venn.diagram(
  x = list(
    sample_relative_wide_snpsUsed_lin1$site,
    sample_relative_wide_snpsUsed_lin2$site,
    sample_relative_wide_snpsUsed_lin3$site,
    sample_relative_wide_snpsUsed_lin4$site
  ),
  category.names = c("lin 1", "lin 2 ", "lin 3", "lin 4"),
  filename = NULL
)
plot.new()
grid.draw(temp)
#
# pdf("testpdf", width = 14, height = 7)
#
# grid.draw(temp)
#
# dev.off()
grid.draw(temp)
# Sites listed for more than one lineage, and how many there are.
allsites[duplicated(allsites)]
sum(duplicated(allsites))
sum(!duplicated(allsites))
site_counts <- table(allsites)
sum(site_counts > 1)
sum(site_counts == 1)
# Names of the sites that occur in more than one lineage list.
duplictednames <- names(site_counts[site_counts > 1])
##----------------------
# Label every SNV: one lineage, several lineages, or not explained
##----------------------
sample_relative_wide_snpsUsed_new_01 <- rbind(
  sample_relative_wide_snpsUsed_lin1,
  sample_relative_wide_snpsUsed_lin2,
  sample_relative_wide_snpsUsed_lin3,
  sample_relative_wide_snpsUsed_lin4
)
# lineage specific snps
sample_relative_wide_snpsUsed_new_02 <- sample_relative_wide_snpsUsed_new_01 %>%
  filter(!site %in% duplictednames)
table(sample_relative_wide_snpsUsed_new_02$explained)
# First pass: all multi-lineage sites get the single label "multiple".
# The four objects assigned here are reassigned further down.
sample_relative_wide_snpsUsed_new_temp <- sample_relative_wide_snpsUsed %>%
  filter(site %in% duplictednames) %>%
  add_column(explained = "multiple")
sample_relative_wide_snpsUsed_new_03 <- rbind(
  sample_relative_wide_snpsUsed_new_02,
  sample_relative_wide_snpsUsed_new_temp
)
sample_relative_wide_snpsUsed_tmp <- sample_relative_wide_snpsUsed %>%
  filter(!site %in% sample_relative_wide_snpsUsed_new_03$site) %>%
  add_column(explained = "not explained")
sample_relative_wide_snpsUsed_final <- rbind(
  sample_relative_wide_snpsUsed_new_03,
  sample_relative_wide_snpsUsed_tmp
)
## EXCURSION: lineage-specific duplications
TMP <- sample_relative_wide_snpsUsed %>%
  filter(site %in% duplictednames)
TMP$e
# Second pass: build a combined label per multi-lineage site by pasting the
# four lineage columns with "_" (e.g. "lin1_lin2_NA_NA").
short_explained <- sample_relative_wide_snpsUsed_new_01 %>%
  filter(site %in% duplictednames) %>%
  select("site", "species", "explained") %>%
  mutate(abundance = explained) %>%
  spread(explained, abundance)
short_explained$explained <- paste(
  short_explained$lin1,
  short_explained$lin2,
  short_explained$lin3,
  short_explained$lin4,
  sep = "_"
)
short_explained_multiple <- short_explained %>%
  select("site", "explained")
# duplicated sites, now carrying the combined label
sample_relative_wide_snpsUsed_new_temp <- sample_relative_wide_snpsUsed %>%
  filter(site %in% duplictednames)
sample_relative_wide_snpsUsed_new_temp_02 <- merge(
  sample_relative_wide_snpsUsed_new_temp,
  short_explained_multiple,
  by = "site"
)
# all explained sites
sample_relative_wide_snpsUsed_new_03 <- rbind(
  sample_relative_wide_snpsUsed_new_02,
  sample_relative_wide_snpsUsed_new_temp_02
)
sample_relative_wide_snpsUsed_tmp <- sample_relative_wide_snpsUsed %>%
  filter(!site %in% sample_relative_wide_snpsUsed_new_03$site) %>%
  add_column(explained = "not explained")
# explained and not explained sites together
sample_relative_wide_snpsUsed_final <- rbind(
  sample_relative_wide_snpsUsed_new_03,
  sample_relative_wide_snpsUsed_tmp
)
# Share (in %) and count of SNVs per label.
totalSNPS <- nrow(sample_relative_wide_snpsUsed_final)
100 * (table(sample_relative_wide_snpsUsed_final$explained) / totalSNPS)
table(sample_relative_wide_snpsUsed_final$explained)
##----------------------
# Boxplot of allele frequencies per sample, one panel per label
##----------------------
# The two cheesemaking samples are shown as "Reference 1" / "Reference 2".
sample_relative_wide_snpsUsed_final <- sample_relative_wide_snpsUsed_final %>%
  dplyr::rename(
    "Reference 2" = "cheesemaking\nday2",
    "Reference 1" = "cheesemaking\nday1"
  )
# Long format: one row per site and sample. After the select, every column
# except site and explained is a sample column, so those are gathered.
sample_relative_wide_snpsUsed_final_long <- sample_relative_wide_snpsUsed_final %>%
  select(
    "site",
    "Lyo\n1996", "Lyo\n2012", "Lyo\n2014", "working\nstock",
    "starter\nculture\n2012", "starter\nculture\n2018",
    "experiment_A", "experiment_B", "experiment_C", "experiment_D",
    "experiment_E", "Reference 1", "Reference 2",
    "explained"
  ) %>%
  gather(key = "sample", value = "Median", -site, -explained)
# ggplot(sample_relative_wide_snpsUsed_final_long,aes(x=sample,y=Median))+geom_boxplot()+theme_classic()+facet_wrap(~explained)
# "Without zeros": keep only frequencies above 0.1.
sample_relative_wide_snpsUsed_final_long_woZeros <- sample_relative_wide_snpsUsed_final_long %>%
  filter(Median > 0.1)
ggplot(
  sample_relative_wide_snpsUsed_final_long_woZeros,
  aes(x = sample, y = Median)
) +
  geom_boxplot() +
  theme_classic() +
  facet_wrap(~explained)
##----------------------
# Line plot: one line per site across the samples, one panel per label
##----------------------
# Fix the order of the samples on the x axis and of the panels.
sample_relative_wide_snpsUsed_final_long_woZeros$sample <- factor(
  sample_relative_wide_snpsUsed_final_long_woZeros$sample,
  levels = c(
    "Lyo\n1996", "Lyo\n2012", "Lyo\n2014", "working\nstock",
    "starter\nculture\n2012", "starter\nculture\n2018",
    "cheesemaking\nday1", "cheesemaking\nday2",
    "experiment_A", "experiment_B", "experiment_C", "experiment_D",
    "experiment_E", "Reference 1", "Reference 2"
  )
)
sample_relative_wide_snpsUsed_final_long_woZeros$explained <- factor(
  sample_relative_wide_snpsUsed_final_long_woZeros$explained,
  levels = c(
    "lin1", "lin2", "lin3", "lin4", "not explained",
    "lin1_lin2_NA_NA", "NA_NA_lin3_lin4", "lin1_lin2_lin3_lin4",
    "lin1_lin2_NA_lin4", "lin1_NA_lin3_lin4", "lin1_NA_lin3_NA",
    "lin1_NA_NA_lin4", "NA_lin2_lin3_NA", "NA_lin2_NA_lin4",
    "NA_lin2_lin3_lin4"
  )
)
# ggplot(sample_relative_wide_snpsUsed_final_long_woZeros,aes(x=sample,y=Median,group=site))+geom_line(color="red",size=0.2, alpha=.1)+theme_classic()
# Only SNVs that are explained by at least one lineage are drawn.
sample_relative_wide_snpsUsed_final_long_woZeros <- sample_relative_wide_snpsUsed_final_long_woZeros %>%
  filter(explained != "not explained")
pExplainted <- ggplot(
  sample_relative_wide_snpsUsed_final_long_woZeros,
  aes(x = sample, y = Median, group = site, color = explained, fill = explained)
) +
  geom_line(alpha = .5, size = 0.5) +
  theme_classic() +
  facet_wrap(~explained) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9)
  )
table(sample_relative_wide_snpsUsed_final_long_woZeros$explained)
pExplainted
# Write the same figure to the report folder.
svg("~/Desktop/mid_thesis/report/figures/20200430/chapter1/Supplement_SNVs_explained_by_isolates.svg", width = 7, height = 4.5)
# png("~/Desktop/Projects/2019_RMK202_analysis/plot/supp_altNucFre_lineages.png", width = 3400, height = 2600,res=300)
pExplainted
dev.off()
###--------------------
## Subset for ppx: only four of the two-lineage labels, in a single row
table(sample_relative_wide_snpsUsed_final_long_woZeros$explained)
# sample_relative_wide_snpsUsed_final_long_tmp <- sample_relative_wide_snpsUsed_final_long_woZeros %>% filter(explained %in% c("NA_NA_lin3_lin4","NA_lin2_lin3_lin4","NA_lin2_NA_lin4","lin3"))
ppx_labels <- c(
  "lin1_NA_NA_lin4", "NA_lin2_lin3_NA", "NA_lin2_NA_lin4", "lin1_NA_lin3_NA"
)
sample_relative_wide_snpsUsed_final_long_tmp <- sample_relative_wide_snpsUsed_final_long_woZeros %>%
  filter(explained %in% ppx_labels)
# The panel order follows the order of the labels above.
sample_relative_wide_snpsUsed_final_long_tmp$explained <- factor(
  sample_relative_wide_snpsUsed_final_long_tmp$explained,
  levels = ppx_labels
)
# sample_relative_wide_snpsUsed_final_long$explained_02 <- revalue(sample_relative_wide_snpsUsed_final_long$explained, c("lin1"="explained by isolates", "lin2"="explained by isolates","lin3"="explained by isolates","lin4"="explained by isolates","multiple"="explained by isolates","not explained"="not explained by isolates"))
pExplainted <- ggplot(
  sample_relative_wide_snpsUsed_final_long_tmp,
  aes(x = sample, y = Median, group = site, color = explained, fill = explained)
) +
  geom_line(alpha = .5, size = 0.5) +
  theme_classic() +
  facet_wrap(~explained, ncol = 4) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 75, hjust = 1, size = 6)
  )
pExplainted
png("~/Desktop/Projects/2019_RMK202_analysis/plot/supp_altNucFre_lineages_03.png", width = 3400, height = 1000, res = 300)
pExplainted
dev.off()
###--------------------
# Recode the labels: the single-lineage labels and "multiple" become
# "explained by isolates", "not explained" becomes "not explained by isolates".
sample_relative_wide_snpsUsed_final_long$explained_02 <- revalue(
  sample_relative_wide_snpsUsed_final_long$explained,
  c(
    "lin1" = "explained by isolates",
    "lin2" = "explained by isolates",
    "lin3" = "explained by isolates",
    "lin4" = "explained by isolates",
    "multiple" = "explained by isolates",
    "not explained" = "not explained by isolates"
  )
)
# One line per site, one panel per recoded label.
pExplainted <- ggplot(
  sample_relative_wide_snpsUsed_final_long,
  aes(
    x = sample, y = Median, group = site,
    color = explained_02, fill = explained_02
  )
) +
  geom_line(size = 0.5, alpha = 0.5) +
  theme_classic() +
  facet_wrap(~explained_02) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
    legend.position = "none"
  ) +
  labs(
    "",
    x = "",
    y = "Alternative allele frequency"
  )
pExplainted
# The plot is shown a second time, as in the original notebook (which built
# the identical plot twice).
pExplainted
svg("~/Desktop/mid_thesis/report/figures/20200430/chapter1/Supplement_SNVs_explained_by_isolates..svg", width = 7, height = 4.5)
# # png("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.png", width = 1000, height = 800,res=300)
pExplainted
dev.off()
##----------------------
# Explained vs. not explained: two-panel overview
##----------------------
# Collapse the recoded labels into two classes.
sample_relative_wide_snpsUsed_final_long$explained_03 <- ifelse(
  sample_relative_wide_snpsUsed_final_long$explained_02 == "not explained by isolates",
  "not explained by isolates",
  "explained"
)
# Fix the order of the samples on the x axis. "Reference 1" and "Reference 2"
# are part of the levels: values that are missing from the levels are turned
# into NA by factor(), which would drop the two reference samples from the axis.
sample_relative_wide_snpsUsed_final_long$sample <- factor(
  sample_relative_wide_snpsUsed_final_long$sample,
  levels = c(
    "Lyo\n1996", "Lyo\n2012", "Lyo\n2014", "working\nstock",
    "starter\nculture\n2012", "starter\nculture\n2018",
    "cheesemaking\nday1", "cheesemaking\nday2",
    "experiment_A", "experiment_B", "experiment_C", "experiment_D",
    "experiment_E", "Reference 1", "Reference 2"
  )
)
pExplainted_rough <- ggplot(
  sample_relative_wide_snpsUsed_final_long,
  aes(
    x = sample, y = Median, group = site,
    color = explained_03, fill = explained_03
  )
) +
  geom_line(size = 0.5, alpha = 0.5) +
  theme_classic() +
  facet_wrap(~explained_03) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
    legend.position = "none"
  ) +
  labs(
    "",
    x = "",
    y = "Alternative allele frequency"
  )
pExplainted_rough
svg("~/Desktop/mid_thesis/report/figures/20200430/chapter1/Supplement_SNVs_notExplained.svg", width = 7, height = 4.5)
# # png("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.png", width = 1000, height = 800,res=300)
pExplainted_rough
dev.off()
##----------------------
# Where on the genome do these unusual lines (~recombinants) lie?
##----------------------
# Restricted to the candidate labels; this table is replaced by the
# unrestricted one right below.
sample_relative_wide_snpsUsed_final_long_recombinants <- sample_relative_wide_snpsUsed_final_long %>%
  filter(explained %in% c(
    "lin1_lin2_NA_lin4", "lin1_NA_lin3_lin4", "lin1_NA_lin3_NA",
    "lin1_NA_NA_lin4", "NA_lin2_lin3_NA", "NA_lin2_NA_lin4",
    "NA_lin2_lin3_lin4"
  )) %>%
  select("site", "explained") %>%
  unique()
# One row per site with its label, for all labels.
sample_relative_wide_snpsUsed_final_long_recombinants <- sample_relative_wide_snpsUsed_final_long %>%
  select("site", "explained") %>%
  unique()
# The genomic position is the second "_"-separated field of the site name.
site_fields <- str_split_fixed(
  sample_relative_wide_snpsUsed_final_long_recombinants$site, "_", 4
)
sample_relative_wide_snpsUsed_final_long_recombinants$location <- as.numeric(site_fields[, 2])
sample_relative_wide_snpsUsed_final_long_recombinants$explained <- factor(
  sample_relative_wide_snpsUsed_final_long_recombinants$explained,
  levels = c(
    "lin1", "lin2", "lin3", "lin4", "not explained",
    "lin1_lin2_NA_NA", "NA_NA_lin3_lin4", "lin1_lin2_lin3_lin4",
    "NA_lin2_lin3_lin4", "lin1_lin2_NA_lin4", "lin1_NA_lin3_lin4",
    "lin1_NA_lin3_NA", "lin1_NA_NA_lin4", "NA_lin2_lin3_NA",
    "NA_lin2_NA_lin4"
  )
)
# Panel titles: the label plus the interpretation given to it.
sample_relative_wide_snpsUsed_final_long_recombinants$explained_2 <- revalue(
  sample_relative_wide_snpsUsed_final_long_recombinants$explained,
  c(
    "lin1_lin2_NA_NA" = "lin1_lin2_NA_NA=~evolved_after_split",
    "NA_NA_lin3_lin4" = "NA_NA_lin3_lin4=~evolved_after_split",
    "lin1_lin2_lin3_lin4" = "lin1_lin2_lin3_lin4=polishingERRORs",
    "NA_lin2_lin3_lin4" = "NA_lin2_lin3_lin4=lost_in_lin1",
    "lin1_NA_lin3_lin4" = "lin1_NA_lin3_lin4=lost_in_lin2",
    "lin1_lin2_NA_lin4" = "lin1_lin2_NA_lin4=lost_in_lin3",
    "lin1_NA_lin3_NA" = "lin1_NA_lin3_NA=recombination",
    "lin1_NA_NA_lin4" = "lin1_NA_NA_lin4=recombination",
    "NA_lin2_lin3_NA" = "NA_lin2_lin3_NA=recombination",
    "NA_lin2_NA_lin4" = "NA_lin2_NA_lin4=recombination"
  )
)
# ggplot(sample_relative_wide_snpsUsed_final_long_recombinants,aes(x=location,color=explained,fill=explained))+geom_density()+facet_wrap(~explained,scales = "free_y")+theme_classic()
# Histogram of positions, one panel per label with its own y scale.
plocation <- ggplot(
  sample_relative_wide_snpsUsed_final_long_recombinants,
  aes(x = location, color = explained, fill = explained)
) +
  geom_histogram() +
  facet_wrap(~explained_2, scales = "free_y") +
  theme_classic() +
  labs(title = "location of SNVs coming from different ", x = "genomic location") +
  theme(legend.position = "none")
plocation
# svg("~/Desktop/mid_thesis/report/figures/20200430/chapter1/Supplement_SNVs_explained_by_isolates..svg",width=7,height=4.5)
png("~/Desktop/supp_altNucFre_lineages_location.png", width = 3400, height = 2600, res = 300)
plocation
dev.off()
##----------------------
# Not explained SNVs: frequency distribution per sample
##----------------------
table(sample_relative_wide_snpsUsed_final_long$explained)
sample_relative_wide_snpsUsed_final_long_notExplained <- sample_relative_wide_snpsUsed_final_long %>%
  filter(explained == "not explained")
ggplot(sample_relative_wide_snpsUsed_final_long_notExplained, aes(x = Median)) +
  geom_density() +
  facet_wrap(~sample) +
  theme_classic()
### Not many multiallelic sites: compare row count / 6 with the number of
### distinct positions (second "_"-separated field of the site name).
# NOTE(review): the divisor 6 is hard-coded; the long table holds 13 samples
# per site, so confirm that 6 is still the intended value.
sitesss <- str_split_fixed(
  sample_relative_wide_snpsUsed_final_long_notExplained$site, "_", 4
)[, 2]
length(sitesss) / 6
length(unique(sitesss))
# The same comparison for all SNVs.
sitesss <- str_split_fixed(
  sample_relative_wide_snpsUsed_final_long$site, "_", 4
)[, 2]
length(sitesss) / 6
length(unique(sitesss))
##----------------------
# Means and medians of the allele frequency per sample and label
##----------------------
library(patchwork)
# Variant that keeps the zeros (not run):
# mediannssss <- aggregate(.~sample+explained, data=sample_relative_wide_snpsUsed_final_long[,c("explained","sample","Median")], median, na.rm=TRUE)%>% filter(explained!="multiple")
# meanssss <- aggregate(.~sample+explained, data=sample_relative_wide_snpsUsed_final_long[,c("explained","sample","Median")], mean, na.rm=TRUE) %>% filter(explained!="multiple")
#
# mediansplot <- ggplot(mediannssss,aes(x=sample,y=Median,color=explained,fill=explained))+geom_bar(stat="identity")+theme_classic()+theme(legend.position = "none")
# meansssplot <- ggplot(meanssss,aes(x=sample,y=Median,color=explained,fill=explained))+geom_bar(stat="identity")+theme_classic()
#
# mediansplot+meansssplot
##---without zeros
# table(sample_relative_wide_snpsUsed_final_long_woZeros$sample)
frequency_columns <- sample_relative_wide_snpsUsed_final_long_woZeros[, c("explained", "sample", "Median")]
mediannssss <- aggregate(
  . ~ sample + explained,
  data = frequency_columns, median, na.rm = TRUE
) %>%
  filter(explained != "multiple")
# The mean has to be aggregated with mean(); with median() the "means" panel
# was an exact copy of the medians panel.
# The aggregated value column keeps the name "Median" in both tables.
meanssss <- aggregate(
  . ~ sample + explained,
  data = frequency_columns, mean, na.rm = TRUE
) %>%
  filter(explained != "multiple")
mediansplot <- ggplot(
  mediannssss,
  aes(x = sample, y = Median, color = explained, fill = explained)
) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(legend.position = "none")
meansssplot <- ggplot(
  meanssss,
  aes(x = sample, y = Median, color = explained, fill = explained)
) +
  geom_bar(stat = "identity") +
  theme_classic()
# Medians (left) next to means (right).
mediansplot + meansssplot
##---------------------------------
## Lineage abundance per sample, derived from the median allele frequencies
##---------------------------------
# Median frequency of the SNVs shared by lin3 and lin4 ...
ratio_larger <- mediannssss %>%
  filter(explained == "NA_NA_lin3_lin4") %>%
  select("sample", "Median") %>%
  dplyr::rename(Overall_ratio = Median)
# ... and of the SNVs specific to lin2 and to lin3.
ratio_larger_2 <- mediannssss %>%
  filter(explained == "lin2") %>%
  select("sample", "Median") %>%
  dplyr::rename(rel_lin2 = Median)
ratio_larger_3 <- mediannssss %>%
  filter(explained == "lin3") %>%
  select("sample", "Median") %>%
  dplyr::rename(rel_lin3 = Median)
# ratio_larger_4 <- mediannssss %>% filter(explained=="lin4") %>% select("sample","Median") %>% dplyr::rename(rel_lin4 = Median)
# Left joins: samples without a lin2 / lin3 value keep NA.
ratio_together_1 <- merge(ratio_larger, ratio_larger_2, by = "sample", all.x = TRUE)
ratio_together_2 <- merge(ratio_together_1, ratio_larger_3, by = "sample", all.x = TRUE)
# Overall_ratio splits the population into lin3+lin4 vs. lin1+lin2;
# rel_lin2 and rel_lin3 split each half again.
not_lin3_lin4 <- 1 - ratio_together_2$Overall_ratio
ratio_together_2$lin1 <- not_lin3_lin4 * (1 - ratio_together_2$rel_lin2)
ratio_together_2$lin2 <- not_lin3_lin4 * (ratio_together_2$rel_lin2)
ratio_together_2$lin3 <- (ratio_together_2$Overall_ratio) * (ratio_together_2$rel_lin3)
ratio_together_2$lin4 <- (ratio_together_2$Overall_ratio) * (1 - ratio_together_2$rel_lin3)
# NOTE(review): same content as ratio_larger_3 (filters on "lin3") and not
# used below in this section.
ratio_larger_4 <- mediannssss %>%
  filter(explained == "lin3") %>%
  select("sample", "Median") %>%
  dplyr::rename(rel_lin3 = Median)
## Ratios in the reference samples
# There are no lin3- or lin4-specific hits, which is why it cannot be calculated like before.
# lin1 is the reference; everything that is not lin1 is lin2.
REffs2 <- mediannssss %>%
  filter(sample == "Reference 1") %>%
  filter(explained == "lin2") %>%
  select(Median) %>%
  unlist() %>%
  as.numeric()
ratio_together_final_referenceSamples <- data.frame(
  sample = "Reference 1", lin4 = 0, lin3 = 0, lin2 = REffs2, lin1 = 1 - REffs2
)
REffs2 <- mediannssss %>%
  filter(sample == "Reference 2") %>%
  filter(explained == "lin2") %>%
  select(Median) %>%
  unlist() %>%
  as.numeric()
ratio_together_final_referenceSamples2 <- data.frame(
  sample = "Reference 2", lin4 = 0, lin3 = 0, lin2 = REffs2, lin1 = 1 - REffs2
)
# ratio_together_final_referenceSamples <-
# ratio_together_2$Unknown <- 1-(ratio_together_2$lin1+ratio_together_2$lin2+ratio_together_2$lin3+ratio_together_2$lin4)
# ratio_together_final <- ratio_together_2 %>% select(sample,lin1,lin2,lin3,lin4) %>% gather(.,lineage,"Relative abundance","lin1","lin2","lin3","lin4")
# Stack the metagenome and reference samples and reshape to long format.
ratio_together_final_tmp1 <- ratio_together_2 %>%
  select(sample, lin4, lin3, lin2, lin1)
ratio_together_final <- rbind(
  ratio_together_final_tmp1,
  ratio_together_final_referenceSamples,
  ratio_together_final_referenceSamples2
) %>%
  gather(., lineage, "Relative abundance", "lin4", "lin3", "lin2", "lin1")
table(ratio_together_final$sample)
# Fix the order of the samples (x axis) and of the lineages (bar stacking).
ratio_together_final$sample <- factor(
  ratio_together_final$sample,
  levels = c(
    "Lyo\n1996", "Lyo\n2012", "Lyo\n2014", "working\nstock",
    "starter\nculture\n2012", "starter\nculture\n2018",
    "cheesemaking\nday1", "cheesemaking\nday2",
    "experiment_A", "experiment_B", "experiment_C", "experiment_D",
    "experiment_E", "Reference 1", "Reference 2"
  )
)
ratio_together_final$lineage <- factor(
  ratio_together_final$lineage,
  levels = c("lin4", "lin3", "lin2", "lin1")
)
# colorsss <- c("#0000FF","#6699FF","#99CCFF","#00FFFF")
colorsss <- c("#99CCFF", "#00FFFF", "#0000FF", "#6699FF")
plot_strain_abundance <- ggplot(
  ratio_together_final,
  aes(x = sample, y = `Relative abundance`, color = lineage, fill = lineage)
) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  labs(x = "") +
  scale_color_manual(values = colorsss) +
  scale_fill_manual(values = colorsss) +
  theme(legend.position = "none")
plot_strain_abundance
# png("~/Desktop/supp_strain_Abundance.png", width = 2800, height = 2200,res=300)
# The svg device opened here is closed by the dev.off() that follows.
svg("~/Desktop/Projects/2019_RMK202_analysis/plot/strain_rel_abundance.svg", width = 6, height = 3.5)
plot_strain_abundance
dev.off()
1.4 Figure 4
Figure 4. Phenotypic properties of individual strains, pairwise combination of strains, and original starter culture. A) Colony forming units (CFUs) of S. thermophilus and L. delbrueckii over 18h of growth when cultured alone, in pairwise combinations, or in the original starter cultures (RMK). The ribbons illustrate the interquartile range and the lines the modeled growth curves. (* indicates t-test, p-value<0.001) B) Acidification curves of the same samples. The ribbons illustrate the min and max pH of the different samples. C) Principal component analysis of the metabolic profiles after 24 h of growth at 37 °C. Different treatments are highlighted in colors and with the surrounding ellipse.
1.4.1 Acidification
Here, I analyse the acidification and growth rates measured on 13.7.2020. I measured plate one, which was pipetted with the robot: first pH 4.6, then milk, then + culture (total 175 ul).
###-----------------------------
## Variables
###-----------------------------
# Input folder and the file names inside it.
location <- "../data_zenodo/non_genomic_data//acidifciation_20200714/"
plateName <- "20200712_rmk202_strains_curated_02.txt"
NamesWell <- "200714_names.csv"
SampleNamesWell <- "200714_samples_names.csv"
# Prefix of the output files written next to the input.
sampleName <- "20200712_rmk202_strains"
# replicate=03
# plateNumber=1
# Time points in seconds.
CALIBRATION_PH4_5 <- 6000 # readings before this second form the low-pH calibration signal
CALIBRATION_start_PH6_5 <- 13000 # from this second on the readings form the high-pH calibration signal
CALIBRATION_PH6_5 <- 16000 # until this second the readings form the high-pH calibration signal
MEASURMENT_START <- 14000 # second at which the measurement starts
###-----------------------------
## Contaminated wells
## This has to be added after the first run: check whether large outliers or
## fermented blanks occur.
###-----------------------------
exclude <- c("B5", "A12", "A8", "A4")
###-----------------------------
## Import the plate reader export
###-----------------------------
# X20200616_test_hydro_final_02 <- read_delim("~/Desktop/Projects/2020_StarterCultureDiversity/02_ph_measurment/hydroplates_measurment/platte1/200701_plate_01_rmk_r01_final_copypaste_02.txt", "\t", escape_double = FALSE, col_types = cols(Time = col_time(format = "%d.%H:%M:%S")), trim_ws = TRUE) %>%dplyr::select(-X99)
X20200616_test_hydro_final_02 <- read_delim(
  paste0(location, plateName),
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
) %>%
  dplyr::select(-"X99")
colnames(X20200616_test_hydro_final_02)
###-----------------------------
## Prepare the time axis
# Unfortunately the Molecular Devices machine gives a weird date format (after
# 24h it adds "1." in front of the hour column).
# In order to correct it, the previous bash chunk has to be run as well.
###-----------------------------
# "<days>.<hh:mm:ss>" is split at the first "."; the clock part is then split
# at ":".
day_and_clock <- str_split_fixed(X20200616_test_hydro_final_02$`0.Time`, fixed("."), 2)
X20200616_test_hydro_final_02$days <- as.numeric(day_and_clock[, 1])
X20200616_test_hydro_final_02$time <- day_and_clock[, 2]
clock_fields <- str_split_fixed(X20200616_test_hydro_final_02$time, fixed(":"), 3)
X20200616_test_hydro_final_02$hourOld <- as.numeric(clock_fields[, 1])
X20200616_test_hydro_final_02$min <- as.numeric(clock_fields[, 2])
# X20200616_test_hydro_final_02$sec <- str_split_fixed(X20200616_test_hydro_final_02$time, fixed(":"), 3)[,3]
# X20200616_test_hydro_final_02[,90:103]
# X20200616_test_hydro_final_02$hourNew <- as.character(X20200616_test_hydro_final_02$hourOld+(24*X20200616_test_hydro_final_02$days))
# TIMEfinal is the elapsed time as a duration (days + hours + minutes; the
# seconds are not used). The helper columns are dropped afterwards.
X20200616_test_hydro_final_03 <- X20200616_test_hydro_final_02 %>%
  mutate(
    days = duration(days, "day"),
    hourOld = duration(hourOld, "hour"),
    min = duration(min, "minute"),
    # sec = duration(sec, 'second'),
    TIMEfinal = hourOld + days + min
  ) %>%
  dplyr::select(-c("days", "time", "hourOld", `0.Time`, `Temperature(¡C)`, "min"))
hydroplate_wide_prep <- X20200616_test_hydro_final_03
# Raw signal of well A1 with the calibration and measurement time points as
# vertical lines, to check that the time points are set sensibly.
plotForCalibration <- ggplot(hydroplate_wide_prep, aes(x = TIMEfinal, y = A1)) +
  geom_point() +
  theme_classic() +
  geom_vline(
    xintercept = c(
      CALIBRATION_PH4_5, CALIBRATION_PH6_5,
      MEASURMENT_START, CALIBRATION_start_PH6_5
    )
  )
plotForCalibration
ggp <- ggplotly(plotForCalibration)
ggp
###-----------------------------
## pH calibration: per-well linear mapping from raw signal to pH
###-----------------------------
# Mean raw signal per well inside the high-pH calibration window.
hydroplate_wide_ph_6.4 <- X20200616_test_hydro_final_03 %>%
  filter(
    TIMEfinal >= (paste0(CALIBRATION_start_PH6_5, "s")) &
      TIMEfinal <= (paste0(CALIBRATION_PH6_5, "s"))
  ) %>%
  dplyr::select(-"TIMEfinal") %>%
  colMeans(na.rm = TRUE) %>%
  as.data.frame() %>%
  rownames_to_column(var = "well")
colnames(hydroplate_wide_ph_6.4)[2] <- "pH_6.4"
# Mean raw signal per well before the end of the low-pH calibration phase.
hydroplate_wide_ph_4_66 <- X20200616_test_hydro_final_03 %>%
  filter(TIMEfinal < (paste0(CALIBRATION_PH4_5, "s"))) %>%
  dplyr::select(-"TIMEfinal") %>%
  colMeans(na.rm = TRUE) %>%
  as.data.frame() %>%
  rownames_to_column(var = "well")
# hydroplate_wide_ph_4_66 <- tail(X20200616_test_hydro_final_03,n = 10) %>% dplyr::select(-"TIMEfinal") %>% colMeans() %>% as.data.frame() %>% rownames_to_column(var="well")
colnames(hydroplate_wide_ph_4_66)[2] <- "pH_4.66"
# hydroplate_wide[90,]
hydroplate_wide_ph_calibration <- merge(
  hydroplate_wide_ph_6.4,
  hydroplate_wide_ph_4_66,
  by = "well"
)
hydroplate_wide_ph_calibration_curve <- hydroplate_wide_ph_calibration
###---------------calculate ph_regression
# pH values assigned to the two calibration signals.
# NOTE(review): the columns are named pH_6.4 / pH_4.66 while 6.5 / 4.6 are
# used here — confirm which values are correct.
ph_high <- 6.5
ph_low <- 4.6
# The mean signals are truncated to integers before use.
signal_high <- as.integer(hydroplate_wide_ph_calibration_curve$pH_6.4)
signal_low <- as.integer(hydroplate_wide_ph_calibration_curve$pH_4.66)
# hydroplate_wide_ph_calibration_curve$slope <- ((hydroplate_wide_ph_calibration_curve$pH_6.4- hydroplate_wide_ph_calibration_curve$pH_4.66)/(ph_high-ph_low))
# Slope: pH units per unit of raw signal.
hydroplate_wide_ph_calibration_curve$slope <- (ph_high - ph_low) / (signal_high - signal_low)
# hydroplate_wide_ph_calibration_curve$intersect <- hydroplate_wide_ph_calibration_curve$pH_6.4/(hydroplate_wide_ph_calibration_curve$slope*ph_high)
# Intercept: chosen so that the high calibration signal maps to ph_high.
hydroplate_wide_ph_calibration_curve$intersect <- ph_high - (hydroplate_wide_ph_calibration_curve$slope * signal_high)
# hydroplate_wide_ph_calibration_curve$intersect <- ph_low/(hydroplate_wide_ph_calibration_curve$slope*hydroplate_wide_ph_calibration_curve$pH_4.66)
# hydroplate_wide_ph_calibration_curve$test <- (as.integer(hydroplate_wide_ph_calibration_curve$pH_6.4)*hydroplate_wide_ph_calibration_curve$slope)+hydroplate_wide_ph_calibration_curve$intersect
# test_intenstiy <- 4045145
# hydroplate_wide_ph_calibration_curve$slope*test_intenstiy+hydroplate_wide_ph_calibration_curve$intersect
hydroplate_calbration_test <- hydroplate_wide_ph_calibration_curve %>%
  dplyr::select(well, slope, intersect)
###-----------------------------
## Apply the calibration to every reading
###-----------------------------
# Long format: one row per time point and well. The first 96 columns are
# taken as the well columns; their raw signal ends up in the column "pH".
hydroplate_long <- gather(
  hydroplate_wide_prep,
  sample,
  pH,
  colnames(hydroplate_wide_prep)[1:96],
  factor_key = TRUE,
  na.rm = TRUE
)
# Attach slope and intercept of the matching well.
hydroplate_wide_ph_calibration_02 <- merge(
  hydroplate_long,
  hydroplate_calbration_test,
  by.x = "sample",
  by.y = "well"
)
# Calibrated pH = slope * raw signal + intercept.
hydroplate_wide_ph_calibration_02$intensity_calibrated <-
  hydroplate_wide_ph_calibration_02$slope * hydroplate_wide_ph_calibration_02$pH +
  hydroplate_wide_ph_calibration_02$intersect
# ggplot(hydroplate_wide_ph_calibration_02,aes(x=TIMEfinal,y=intensity_calibrated,group=sample,fill=sample,color=sample))+geom_point()+theme_classic()
# No time filter is applied at the moment (alternatives kept below).
hydroplate_wide_ph_calibration_03 <- hydroplate_wide_ph_calibration_02
# hydroplate_wide_ph_calibration_03 <- hydroplate_wide_ph_calibration_02 %>% filter(TIMEfinal >= ('32000s'))
# hydroplate_wide_ph_calibration_03 <- hydroplate_wide_ph_calibration_02 %>% filter(TIMEfinal >= ('32000s')&TIMEfinal < ('150'))
# hydroplate_wide_ph_calibration_03 <- hydroplate_wide_ph_calibration_02 %>% filter(TIMEfinal < ('40000s'))
ggplot(
  hydroplate_wide_ph_calibration_03,
  aes(
    x = TIMEfinal, y = intensity_calibrated,
    group = sample, fill = sample, color = sample
  )
) +
  geom_point() +
  theme_classic()
###-----------------------------
## Add the sample names to the wells
###-----------------------------
hydroplates_names <- read_delim(
  paste0(location, NamesWell),
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
hydroplates_names_samples <- read_delim(
  paste0(location, SampleNamesWell),
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Full outer join of the two name tables on SAMPLE.
hydroplates_names_final <- merge(
  hydroplates_names,
  hydroplates_names_samples,
  by = "SAMPLE",
  all = TRUE
)
# Attach the names to the readings and drop the blanks.
hydroplate_wide_ph_calibration_04 <- merge(
  hydroplate_wide_ph_calibration_03,
  hydroplates_names_final,
  by.x = "sample",
  by.y = "well"
) %>%
  filter(SAMPLE != "blank")
# table(hydroplate_wide_ph_calibration_04$no_growth)
table(hydroplate_wide_ph_calibration_04$SAMPLE)
table(hydroplate_wide_ph_calibration_04$sample)
###-----------------------------
## Exclude wells if necessary
###-----------------------------
hydroplate_wide_ph_calibration_05 <- hydroplate_wide_ph_calibration_04 %>%
  filter(!sample %in% exclude)
# Interactive overview: calibrated pH over time, one panel per SAMPLE, with
# the calibration and measurement time points as vertical lines.
htmlPrep <- ggplot(
  hydroplate_wide_ph_calibration_05,
  aes(
    x = TIMEfinal, y = intensity_calibrated,
    group = sample, fill = SAMPLE, color = SAMPLE
  )
) +
  geom_point() +
  theme_classic() +
  facet_wrap(~SAMPLE) +
  labs(y = "pH", x = "") +
  geom_vline(xintercept = c(CALIBRATION_PH4_5, CALIBRATION_PH6_5, MEASURMENT_START))
# htmlPrep <- ggplot(hydroplate_wide_ph_calibration_05,aes(x=TIMEfinal,y=intensity_calibrated,group=sample,fill=sample,color=sample))+geom_point()+theme_classic()+labs(y="pH",x="")
# htmlPrep
ggp <- ggplotly(htmlPrep)
ggp
htmlwidgets::saveWidget(ggp, paste0(location, sampleName, "_cleaned.html"))
# Keep only the readings after the start of the high-pH calibration window.
# hydroplate_wide_ph_calibration_05 <- hydroplate_wide_ph_calibration_05 %>% filter(TIMEfinal > (paste0(MEASURMENT_START,"s")))
hydroplate_wide_ph_calibration_05 <- hydroplate_wide_ph_calibration_05 %>%
  filter(TIMEfinal > (paste0(CALIBRATION_start_PH6_5, "s")))
##-----------------------------------------------------------
# Modelling: fit a Baranyi curve to the calibrated pH trace of every well
##-----------------------------------------------------------
Sys.sleep(10)
# Start values for the model parameters.
p <- c(y0 = 6.608016, mumax = 0.0003549404, K = 4.703572, h0 = 11.044679)
#
# lower <- c(y0 = 5, mumax = 0.5, K = 3.5, h0 = 1)
# upper <- c(y0 = 8, mumax = 2.5, K = 7, h0 = 10)
# NOTE(review): TIME (numeric seconds) is created here, but the model formula
# below uses TIMEfinal — confirm which of the two is intended.
hydroplate_wide_ph_calibration_05$TIME <- as.numeric(hydroplate_wide_ph_calibration_05$TIMEfinal)
# One fit per combination of sample (well), NAME and grouping.
many_baranyi_sub <- all_growthmodels(
  intensity_calibrated ~ grow_baranyi(TIMEfinal, parms) | sample + NAME + grouping,
  data = hydroplate_wide_ph_calibration_05,
  p = p,
  ncores = 8
)
# results(many_baranyi_sub)
# Show all fits in a 12 x 8 grid with small margins.
par(mfrow = c(12, 8))
par(mar = c(1, 1, 1, 1))
plot(many_baranyi_sub)
many_baranyi2_res <- results(many_baranyi_sub)
# This first lagPhase (h0 / log(2)) is only visible in the printout below;
# it is replaced by h0 / mumax right after.
many_baranyi2_res$lagPhase <- many_baranyi2_res$h0 / log(2)
many_baranyi2_res
many_baranyi2_res$lagPhase <- many_baranyi2_res$h0 / many_baranyi2_res$mumax
many_baranyi2_res_preped <- many_baranyi2_res
# many_baranyi2_res_preped <- many_baranyi2_res %>% filter(name!="blank")
# many_baranyi2_res_preped <- many_baranyi2_res %>% filter(r2>0.98) %>% filter(name!="blank")
# Flip the sign of mumax; it is plotted as "maximum pH decrease".
many_baranyi2_res_preped$mumax <- -many_baranyi2_res_preped$mumax
summary(many_baranyi2_res_preped)
table(many_baranyi2_res_preped$NAME)
# Fix the order of the panels.
many_baranyi2_res_preped$grouping <- factor(
  many_baranyi2_res_preped$grouping,
  levels = c("lactobacillus", "streptococcus", "pairwise", "complex")
)
mumax_plot <- ggplot(many_baranyi2_res_preped, aes(x = NAME, y = mumax)) +
  geom_boxplot() +
  theme_classic() +
  labs(x = "", y = "maximum pH decrease") +
  facet_wrap(~grouping, ncol = 5, scales = "free_x")
# ggplot(many_baranyi2_res_preped,aes(x=name,y=y0))+geom_boxplot()+theme_classic()+labs(x="","intitial pH")
k0plot <- ggplot(many_baranyi2_res_preped, aes(x = NAME, y = K)) +
  geom_boxplot() +
  theme_classic() +
  labs(x = "", y = "lowest pH") +
  facet_wrap(~grouping, ncol = 5, scales = "free_x")
# ggplot(many_baranyi2_res_preped,aes(x=name,y=h0))+geom_boxplot()+theme_classic()
# Plot ----
# All per-well Baranyi fits in one PNG (12 x 8 panel grid, tight margins).
png(
  paste0(location, sampleName, "_all_baranyi.png"),
  width = 4000,
  height = 2000,
  res = 300
)
# svg("~/Desktop/Projects/2020_strainDelineation/04_pH_measurments/191127_pH_PLOT.svg",width=4,height=3)
par(mfrow = c(12, 8), mar = c(1, 1, 1, 1))
plot(many_baranyi_sub)
dev.off()

# Raw calibrated traces, one facet per NAME, without a legend.
finalSamplesPlot <- ggplot(
  hydroplate_wide_ph_calibration_05,
  aes(x = TIMEfinal, y = intensity_calibrated, group = NAME, fill = NAME, color = NAME)
) +
  geom_point() +
  theme_classic() +
  facet_wrap(~NAME) +
  labs(y = "pH", x = "") +
  theme(legend.position = "none")
svg(paste0(location, sampleName, "_all_samples_pH.svg"), width = 12, height = 8)
finalSamplesPlot
dev.off()

# Both summary boxplots stacked: once into the SVG, once on screen.
svg(paste0(location, sampleName, "_all_boxplots_summary.svg"), width = 4, height = 4)
mumax_plot + k0plot + plot_layout(nrow = 2)
dev.off()
mumax_plot + k0plot + plot_layout(nrow = 2)

# Output data ----
# Tag the fitted parameters with plate and replicate and write them as a
# tab-separated table next to the input data.
many_baranyi2_res_preped$plate <- as.character(plateNumber)
many_baranyi2_res_preped$replicate <- as.character(replicate)
write.table(
  many_baranyi2_res_preped,
  paste0(location, sampleName, "_model.txt"),
  na = "",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)
###-------------------------------------------------
## Lag-phase phasing
## h0 parameter specifying the initial physiological state of organisms (e.g. cells) and in consequence the lag phase (h0 = max growth rate * lag phase).
###-------------------------------------------------
str(many_baranyi2_res)
library(tidyverse)
many_baranyi2_res$sample
# Per-well lag phase, merged back onto the time series.
preped <- many_baranyi2_res %>%
  dplyr::select(c("sample", "lagPhase")) %>%
  remove_rownames()
hydroplate_wide_ph_calibration_06 <- merge(hydroplate_wide_ph_calibration_05, preped, by = "sample")
hydroplate_wide_ph_calibration_06 <- hydroplate_wide_ph_calibration_06 %>%
  mutate(lagSeconds = duration(lagPhase, 'second'))
# %>% dplyr::select(-c("days","time","hourOld",`0.Time`,`Temperature(¡C)`,"min"))
# Shift every trace by its lag phase; "lactobacillus" wells are shifted by only
# 75 % of their lag. The plain `TIMEfinal - lagSeconds` assignment that used to
# precede this line was a dead store (immediately overwritten) and was removed.
hydroplate_wide_ph_calibration_06$curatedTime <- ifelse(
  hydroplate_wide_ph_calibration_06$grouping == "lactobacillus",
  hydroplate_wide_ph_calibration_06$TIMEfinal - (0.75 * hydroplate_wide_ph_calibration_06$lagSeconds),
  hydroplate_wide_ph_calibration_06$TIMEfinal - hydroplate_wide_ph_calibration_06$lagSeconds
)
# All phased traces in one panel.
htmlPrep <- ggplot(
  hydroplate_wide_ph_calibration_06,
  aes(x = curatedTime, y = intensity_calibrated, group = sample, fill = grouping, color = grouping)
) +
  geom_point(alpha = 0.1) +
  theme_classic()
ggp <- ggplotly(htmlPrep)
ggp
# hydroplate_wide_ph_calibration_06$grouping
# Same data, one facet per grouping.
htmlPrep <- ggplot(
  hydroplate_wide_ph_calibration_06,
  aes(x = curatedTime, y = intensity_calibrated, group = sample, fill = grouping, color = grouping)
) +
  geom_point() +
  theme_classic() +
  facet_wrap(. ~ grouping, ncol = 4)
ggp <- ggplotly(htmlPrep)
ggp
# Thinned copy (every 20th row) for a lighter interactive plot.
hydroplate_wide_ph_calibration_06_sub <- hydroplate_wide_ph_calibration_06 %>%
  filter(row_number() %% 20 == 1)
htmlPrep <- ggplot(
  hydroplate_wide_ph_calibration_06_sub,
  aes(x = curatedTime, y = intensity_calibrated, group = sample, fill = grouping, color = grouping)
) +
  geom_point(alpha = 0.1) +
  theme_classic()
ggp <- ggplotly(htmlPrep)
ggp
###-------------------------------------
## Rolling min / mean / max of the phased pH signal per grouping
###-------------------------------------
# TS <- zoo(c(4, 5, 7, 3, 9, 8))
# rollapply(TS, width = 3, by = 2, FUN = mean, align = "left")
# Windows of 10 consecutive rows (ordered by curatedTime), advanced by 5 rows.
window_width <- 10
window_step <- 5
rolling_summaries <- list()
for (groupsss in unique(hydroplate_wide_ph_calibration_06$grouping)) {
  group_rows <- hydroplate_wide_ph_calibration_06 %>%
    filter(grouping == groupsss) %>%
    arrange(curatedTime)
  # The results get their own names: the previous `min`/`mean`/`max` variables
  # shadowed the base functions that are passed as `FUN` on the next iteration.
  roll_min <- rollapply(group_rows$intensity_calibrated, width = window_width, by = window_step, FUN = min, align = "left")
  roll_mean <- rollapply(group_rows$intensity_calibrated, width = window_width, by = window_step, FUN = mean, align = "left")
  roll_max <- rollapply(group_rows$intensity_calibrated, width = window_width, by = window_step, FUN = max, align = "left")
  # Left-aligned windows start at rows 1, 6, 11, ...; use the curatedTime of
  # each window's first row as its time stamp.
  window_start_rows <- seq(1, by = window_step, length.out = length(roll_max))
  window_start_time <- group_rows$curatedTime[window_start_rows]
  # NOTE: the column called `median` holds the rolling mean; the name is kept
  # so existing consumers of `final_subsetting` keep working.
  rolling_summaries[[length(rolling_summaries) + 1]] <- tibble(
    time = window_start_time,
    grouping = groupsss,
    minum = roll_min,
    maxum = roll_max,
    median = roll_mean
  )
}
# Bind once instead of growing the table inside the loop.
final_subsetting <- bind_rows(rolling_summaries)
table(final_subsetting$grouping)
# Keep time stamps below 64800 and express them in units of 3600
# (presumably seconds -> hours, i.e. the first 18 h - confirm the unit).
final_subsetting_subset <- final_subsetting %>%
  filter(time < 64800)
final_subsetting_subset$timeFinal <- final_subsetting_subset$time / 3600
# The "complex" grouping is not shown in the figure.
final_subsetting_subset_final <- final_subsetting_subset %>%
  filter(grouping != "complex")
# Earliest phased time point; reused further down to align the CFU plots.
minTime <- min(final_subsetting_subset_final$timeFinal)
colorsss <- c("#0081a7ff", "#fec3b7ff", "#9f9f92ff", "#6d6466ff", "#4e3d42ff")
colorss_02 <- c("#fec3b7ff", "#9f9f92ff", "#6d6466ff", "#0081a7ff")
# Ribbon spanning the rolling minimum and maximum pH of every grouping.
low_phased_ribbon <- ggplot() +
  geom_ribbon(
    aes(x = timeFinal, ymin = minum, ymax = maxum, group = grouping, fill = grouping),
    data = final_subsetting_subset_final,
    alpha = .4
  ) +
  # scale_x_discrete( expand = c(0, 0)) +
  # facet_wrap(~grouping,scales = "free")+
  labs(y = "pH", x = "phased incubation time") +
  scale_fill_manual(values = colorss_02) +
  scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c(0, 6, 12, 18)) +
  theme_classic() +
  theme(
    rect = element_rect(fill = "transparent"), # all rectangles
    legend.position = "bottom",
    legend.title = element_blank(),
    # axis.text.x = element_blank(),
    axis.text.y = element_text(size = 8),
    # axis.title = element_text(size=9),
    plot.margin = unit(c(t = 0, r = 0.5, b = 0, l = 0.1), "cm")
  )
low_phased_ribbon
# Same figure written to disk.
svg("03_results/acidifaction_curve.svg", width = 6, height = 4)
low_phased_ribbon
dev.off()
# Look at offset ----
colnames(hydroplate_wide_ph_calibration_06)
# One row per distinct (grouping, lagPhase) pair.
lagsss <- hydroplate_wide_ph_calibration_06 %>%
  dplyr::select(c("grouping", "lagPhase")) %>%
  remove_rownames() %>%
  unique()
# Despite the name, this is the per-grouping *median* lag phase.
lagsss_mean <- aggregate(
  . ~ grouping,
  data = lagsss,
  median,
  na.rm = TRUE
)
# Same value divided by 3600 (presumably seconds -> hours).
lagsss_mean$hours <- lagsss_mean$lagPhase / 3600
lagsss_mean$hours_corrected <- lagsss_mean$hours-2
1.4.2 Growth data
CFU count of the same time series
# Read the CFU time series (the first line of the file is skipped).
rmk202_strain_timeseries_Sheet1 <- read_csv(
  "../data_zenodo/non_genomic_data/rmk202_growth_strain_timeseries.csv",
  skip = 1
)
# Wide -> long: the columns `0_BM` ... `24_MR` become (sampless, CFU) pairs.
# gather() is kept on purpose: `samplecount` below numbers the rows in
# gather()'s output order and the exclusion list further down refers to those
# numbers, so a reshape with a different row order would shift the IDs.
rmk202_strain_timeseries_Sheet1_long <- rmk202_strain_timeseries_Sheet1 %>%
  gather("sampless", "CFU", `0_BM`:`24_MR`)
# Column names have the form "<time>_<plate>".
sampless_parts <- str_split_fixed(rmk202_strain_timeseries_Sheet1_long$sampless, "_", 2)
rmk202_strain_timeseries_Sheet1_long$time <- as.numeric(sampless_parts[, 1])
rmk202_strain_timeseries_Sheet1_long$plate <- sampless_parts[, 2]
table(rmk202_strain_timeseries_Sheet1_long$plate)
# Translate the plate code into what was counted on that plate.
rmk202_strain_timeseries_Sheet1_long$species <- plyr::revalue(
  rmk202_strain_timeseries_Sheet1_long$plate,
  c("BM" = "total", "M17X" = "S. thermophilus", "MR" = "L. delbrueckii")
)
# Running row id, shown in the plotly tooltip; seq_len() stays valid for an empty table.
rmk202_strain_timeseries_Sheet1_long$samplecount <- seq_len(nrow(rmk202_strain_timeseries_Sheet1_long))
plot_overtime <- ggplot(
  rmk202_strain_timeseries_Sheet1_long,
  aes(x = time, y = CFU, color = species, fill = species, group = sample, text = paste("sampleID=", samplecount))
) +
  geom_line() +
  theme_classic() +
  facet_wrap(~species + group, ncol = 5)
library(plotly)
ggp <- ggplotly(plot_overtime)
ggp
# Exclude ----
# Row ids (samplecount) of measurements flagged as "not grown".
excludes <- c("228", "276", "277", "282", "286", "288")
rmk202_strain_timeseries_Sheet1_long_cleaned <- rmk202_strain_timeseries_Sheet1_long %>%
  filter(!samplecount %in% excludes)
# Same line plot as before, now on the cleaned data and with free scales per facet.
plot_overtime <- ggplot(
  rmk202_strain_timeseries_Sheet1_long_cleaned,
  aes(x = time, y = CFU, color = species, fill = species, group = sample, text = paste("sampleID=", samplecount))
) +
  geom_line() +
  theme_classic() +
  facet_wrap(~species + group, ncol = 5, scales = "free")
ggp <- ggplotly(plot_overtime)
ggp
# Boxplot view of the same data on a log2-transformed y axis.
ggplot(
  rmk202_strain_timeseries_Sheet1_long_cleaned,
  aes(x = time, y = CFU, color = species, fill = species, group = sample, text = paste("sampleID=", samplecount))
) +
  geom_boxplot() +
  theme_classic() +
  facet_wrap(~species + group, ncol = 5) +
  coord_trans(y = "log2")
##----------------------
## min, max and mean CFU per group x time x species
##----------------------
# One summary row per combination that has at least one non-missing CFU value.
# Rows are collected in a list and bound once (no rbind() inside the loop).
# Empty combinations are skipped up front, which replaces the former
# `mean != "NaN"` string comparison and avoids -Inf/Inf warnings from
# min()/max() on empty input.
cfu_summary_rows <- list()
for (typess in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$group)) {
  for (timesss in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$time)) {
    for (speccc in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$species)) {
      cfu_values <- rmk202_strain_timeseries_Sheet1_long_cleaned %>%
        filter(group == typess, time == timesss, species == speccc, !is.na(CFU)) %>%
        dplyr::pull(CFU)
      if (length(cfu_values) == 0) {
        next
      }
      cfu_summary_rows[[length(cfu_summary_rows) + 1]] <- data.frame(
        sample = typess,
        species = speccc,
        time = timesss,
        min = min(cfu_values),
        max = max(cfu_values),
        mean = mean(cfu_values)
      )
    }
  }
}
group_growth_cfu <- dplyr::bind_rows(cfu_summary_rows)
group_growth_cfu$time <- as.numeric(group_growth_cfu$time)
# Mean CFU over time, one line per sample type, one facet row per species.
plot_overtime <- ggplot(
  group_growth_cfu,
  aes(x = time, y = mean, color = sample, fill = sample, group = interaction(sample, species))
) +
  geom_line() +
  theme_classic() +
  facet_wrap(~species, nrow = 3)
plot_overtime
# ggp <- ggplotly(plot_overtime)
# ggp

# Polish ----
# Drop the "total" counts and the "all_strains" sample type.
group_growth_cfu_mean_species <- group_growth_cfu %>%
  filter(species != "total") %>%
  filter(sample != "all_strains") # %>% filter(!is.na(mean))
plot_overtime <- ggplot(
  group_growth_cfu_mean_species,
  aes(x = time, y = mean, color = sample, fill = sample, group = interaction(sample, species))
) +
  geom_line() +
  theme_classic() +
  facet_wrap(~species, nrow = 2)
plot_overtime
# Min-max ribbon with the mean drawn on top.
plot_overtime_RIBBON <- ggplot() +
  geom_ribbon(
    aes(x = time, ymin = min, ymax = max, group = interaction(sample, species), fill = sample),
    data = group_growth_cfu_mean_species,
    alpha = .1
  ) +
  theme_classic() +
  facet_wrap(~species, nrow = 2) +
  geom_line(
    aes(x = time, y = mean, color = sample, fill = sample, group = interaction(sample, species)),
    data = group_growth_cfu_mean_species
  )
plot_overtime_RIBBON
# Polish: boxplot input ----
# Per-measurement data without "total" counts and without "all_strains".
rmk202_strain_timeseries_Sheet1_long_cleaned_box <- rmk202_strain_timeseries_Sheet1_long_cleaned %>%
  filter(species != "total") %>%
  filter(group != "all_strains") # %>% filter(!is.na(mean))
# Time becomes a factor so that every time point gets its own box.
rmk202_strain_timeseries_Sheet1_long_cleaned_box$time <- as.factor(
  rmk202_strain_timeseries_Sheet1_long_cleaned_box$time
)
rmk202_strain_timeseries_Sheet1_long_cleaned_box$species <- factor(
  rmk202_strain_timeseries_Sheet1_long_cleaned_box$species,
  levels = c("S. thermophilus", "L. delbrueckii")
)
plot_overtime_box <- ggplot(
  rmk202_strain_timeseries_Sheet1_long_cleaned_box,
  aes(x = time, y = CFU, fill = group)
) +
  geom_boxplot(alpha = .3, color = "grey") +
  theme_classic() +
  facet_wrap(~species, nrow = 2)
plot_overtime_box
# Combined "<species>_<group>" label; printed once with its default levels,
# then re-levelled into the display order.
rmk202_strain_timeseries_Sheet1_long_cleaned_box$groupingFinal <- as.factor(
  paste0(
    rmk202_strain_timeseries_Sheet1_long_cleaned_box$species,
    "_",
    rmk202_strain_timeseries_Sheet1_long_cleaned_box$group
  )
)
levels(rmk202_strain_timeseries_Sheet1_long_cleaned_box$groupingFinal)
rmk202_strain_timeseries_Sheet1_long_cleaned_box$groupingFinal <- factor(
  rmk202_strain_timeseries_Sheet1_long_cleaned_box$groupingFinal,
  levels = c(
    "S. thermophilus_rmk", "S. thermophilus_pairwise", "S. thermophilus_strepto", "S. thermophilus_lacto",
    "L. delbrueckii_rmk", "L. delbrueckii_pairwise", "L. delbrueckii_lacto", "L. delbrueckii_strepto"
  )
)
plot_overtime_box <- ggplot(
  rmk202_strain_timeseries_Sheet1_long_cleaned_box,
  aes(x = time, y = CFU, fill = groupingFinal)
) +
  geom_boxplot(alpha = .3, color = "grey") +
  theme_classic()
plot_overtime_box
##----------------------
## Model the CFU data with the Baranyi growth model
##----------------------
# NOTE: this section reuses the names many_baranyi_sub, many_baranyi2_res,
# many_baranyi2_res_preped, mumax_plot and k0plot from the pH modelling above
# and overwrites them.
# NOTE(review): the reason for this pause is not visible here - confirm it is still needed.
Sys.sleep(5)
# Start values for the fit.
p <- c(y0 = 500000, mumax = 1, K = 800000000, h0 = 8.044679)
# rmk202_strain_timeseries_Sheet1_long_cleaned_box$time <- as.numeric(rmk202_strain_timeseries_Sheet1_long_cleaned_box$time)
# Model input: species-specific counts only, no "all_strains", no missing CFU.
rmk202_strain_timeseries_Sheet1_long_cleaned_model <- rmk202_strain_timeseries_Sheet1_long_cleaned %>%
  filter(species != "total") %>%
  filter(group != "all_strains")
rmk202_strain_timeseries_Sheet1_long_cleaned_model <- rmk202_strain_timeseries_Sheet1_long_cleaned_model %>%
  filter(!is.na(CFU))
rmk202_strain_timeseries_Sheet1_long_cleaned_model$groupingFinal <- as.factor(
  paste0(
    rmk202_strain_timeseries_Sheet1_long_cleaned_model$species,
    "_",
    rmk202_strain_timeseries_Sheet1_long_cleaned_model$group
  )
)
##----------model1
# One Baranyi fit per "<species>_<group>" combination.
many_baranyi_sub <- all_growthmodels(
  CFU ~ grow_baranyi(time, parms) | groupingFinal,
  data = rmk202_strain_timeseries_Sheet1_long_cleaned_model, p = p, ncores = 8
)
results(many_baranyi_sub)
par(mfrow = c(12, 8))
par(mar = c(1, 1, 1, 1))
plot(many_baranyi_sub)
many_baranyi2_res <- results(many_baranyi_sub)
many_baranyi2_res
# Fixes relative to the copy taken from the pH section:
#  * mumax is no longer negated (the sign flip belonged to the pH-decrease plot),
#  * the table is built on groupingFinal (there is no `name` column here),
#  * the axis labels describe growth parameters instead of pH.
many_baranyi2_res_preped <- many_baranyi2_res
summary(many_baranyi2_res_preped)
table(many_baranyi2_res_preped$groupingFinal)
mumax_plot <- ggplot(many_baranyi2_res_preped, aes(x = groupingFinal, y = mumax)) +
  geom_boxplot() +
  theme_classic() +
  labs(x = "", y = "maximum growth rate")
k0plot <- ggplot(many_baranyi2_res_preped, aes(x = groupingFinal, y = K)) +
  geom_boxplot() +
  theme_classic() +
  labs(x = "", y = "carrying capacity (K)")
mumax_plot + k0plot + plot_layout(nrow = 2)
# Boxplot ----
library(growthcurver)
# Variant 1: box fill from `fill_colers`, box outline from `colorr_colers`.
fill_colers <- c("grey77", "grey55", "red", "grey88", "grey55", "orange")
colorr_colers <- c("red", "red", "red", "orange", "orange", "orange")
plot_overtime_box <- ggplot() +
  geom_boxplot(
    data = rmk202_strain_timeseries_Sheet1_long_cleaned_box,
    aes(x = time, y = CFU, fill = groupingFinal, color = groupingFinal),
    alpha = .2
  ) +
  scale_fill_manual(values = fill_colers) +
  scale_color_manual(values = colorr_colers) +
  theme_classic() +
  theme(legend.position = "bottom")
plot_overtime_box
# Variant 2: one facet per time point and species. Here the two palettes are
# used the other way round (`colorr_colers` for fill, `fill_colers` for outline).
fill_colers <- c("grey77", "grey55", "red", "grey88", "grey55", "orange")
colorr_colers <- c("grey77", "grey77", "grey77", "grey55", "grey55", "grey55")
plot_overtime_box <- ggplot() +
  geom_boxplot(
    data = rmk202_strain_timeseries_Sheet1_long_cleaned_box,
    aes(x = time, y = CFU, fill = groupingFinal, color = groupingFinal),
    alpha = .3
  ) +
  scale_fill_manual(values = colorr_colers) +
  scale_color_manual(values = fill_colers) +
  theme_classic() +
  theme(legend.position = "bottom") +
  facet_wrap(~time + species, ncol = 14, scales = "free_x")
plot_overtime_box
rmk202_strain_timeseries_Sheet1_long_cleaned_box$time
##----------------------
## Quartiles of CFU per group x time x species (input for the interquartile ribbon)
##----------------------
# quantile() is applied to the CFU vector itself (not to a one-column data
# frame) and the rows are bound once instead of growing a data frame with
# rbind() in the innermost loop. Combinations without data yield NA quartiles,
# exactly as before.
cfu_quantile_rows <- list()
for (typess in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$group)) {
  for (timesss in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$time)) {
    for (speccc in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$species)) {
      cfu_values <- rmk202_strain_timeseries_Sheet1_long_cleaned %>%
        filter(group == typess, time == timesss, species == speccc) %>%
        dplyr::pull(CFU)
      # Default probs: 0 %, 25 %, 50 %, 75 %, 100 %.
      cfu_quartiles <- quantile(cfu_values, na.rm = TRUE)
      cfu_quantile_rows[[length(cfu_quantile_rows) + 1]] <- data.frame(
        sample = typess,
        species = speccc,
        time = timesss,
        twentyfifths = cfu_quartiles[[2]], # 25 % quantile
        seventhts = cfu_quartiles[[4]],    # 75 % quantile
        mean = cfu_quartiles[[3]]          # NOTE: this is the median (50 %); column name kept for downstream code
      )
    }
  }
}
group_growth_cfu_quantiles <- dplyr::bind_rows(cfu_quantile_rows)
# group_growth_cfu_quantiles
# Species-specific quartiles only; combinations without data (NA median) are dropped.
group_growth_cfu_quantiles_spec <- group_growth_cfu_quantiles %>%
  filter(species != "total") %>%
  filter(sample != "all_strains") %>%
  filter(!is.na(mean))
# plot_overtime <- ggplot(group_growth_cfu_quantiles_spec,aes(x=time,y=mean,color=sample,fill=sample,group=interaction(sample,species)))+geom_line()+theme_classic()+facet_wrap(~species,nrow=2)
# plot_overtime
# "<species>_<sample>" label in display order.
group_growth_cfu_quantiles_spec$groupingFinal <- factor(
  paste0(group_growth_cfu_quantiles_spec$species, "_", group_growth_cfu_quantiles_spec$sample),
  levels = c(
    "S. thermophilus_rmk", "S. thermophilus_pairwise", "S. thermophilus_strepto", "S. thermophilus_lacto",
    "L. delbrueckii_rmk", "L. delbrueckii_pairwise", "L. delbrueckii_lacto", "L. delbrueckii_strepto"
  )
)
# Palette actually used (earlier alternatives were overwritten before use):
# fill_colers <- c("grey77","grey55","red","grey88","grey55","orange")
# fill_colers <- c("grey77","#97BC62FF","red","grey88","#339E66FF","orange")
# colorr_colers <- c("grey77","grey77","grey77","grey55","grey55","grey55")
fill_colers <- c("#FC766AFF", "#783937FF", "#F1AC88FF", "#339E66FF", "#078282FF", "#95DBE5FF")
# Upper end of the y axis: the largest 75 % quantile.
maxValue <- max(group_growth_cfu_quantiles_spec$seventhts)
# Interquartile ribbon. The x limits are set inside scale_x_continuous():
# when they were given to lims() and scale_x_continuous() was added afterwards,
# ggplot2 replaced the first x scale and the limits were silently lost.
plot_overtime_RIBBON <- ggplot() +
  geom_ribbon(
    aes(x = time, ymin = twentyfifths, ymax = seventhts, group = interaction(sample, species), fill = groupingFinal),
    data = group_growth_cfu_quantiles_spec,
    alpha = .2
  ) +
  theme_classic() +
  lims(y = c(0, maxValue)) +
  scale_fill_manual(values = fill_colers) +
  scale_color_manual(values = fill_colers) +
  scale_x_continuous(limits = c(0, 24), breaks = c(0, 6, 12, 18, 24), labels = c(0, 6, 12, 18, 24))
plot_overtime_RIBBON
# Models on the quartiles ----
group_growth_cfu_quantiles_spec$twentyfifths
df.predicted_model_upper <- data.frame()
df.predicted_model_lower <- data.frame()
for (samplezzz in unique(group_growth_cfu_quantiles_spec$groupingFinal)) {
  tmp <- group_growth_cfu_quantiles_spec %>%
    filter(groupingFinal == samplezzz)
  # tmp$time <- as.double(tmp$time)
  # One growthcurver fit through the 75 % quantiles, one through the 25 % quantiles.
  model.wt <- SummarizeGrowth(tmp$time, tmp$seventhts)
  model.wt_lower <- SummarizeGrowth(tmp$time, tmp$twentyfifths)
  # Evaluate both fits on 50 evenly spaced time points between 0 and 24.
  tt <- seq(0, 24, length = 50)
  tmp <- data.frame(
    group = samplezzz,
    time = tt,
    pred.wt_upper = predict(model.wt$model, newdata = list(t = tt)),
    pred.wt_lower = predict(model.wt_lower$model, newdata = list(t = tt))
  )
  # Upper and lower predictions are stored side by side in this one table.
  df.predicted_model_upper <- rbind(df.predicted_model_upper, tmp)
  # plot_overtime_box + geom_line(data=df.predicted, aes(x=time,y=pred.wt), color="red")
}
# Ribbon between the two fitted curves, on a log10 axis and on a linear axis.
logCFU <- ggplot() +
  geom_ribbon(
    aes(x = time, ymin = pred.wt_lower, ymax = pred.wt_upper, group = group, fill = group),
    data = df.predicted_model_upper,
    alpha = .2
  ) +
  theme_classic() +
  scale_y_continuous(trans = 'log10') +
  labs(y = "CFU/ml") # +coord_trans(y="log10")
nonLOG <- ggplot() +
  geom_ribbon(
    aes(x = time, ymin = pred.wt_lower, ymax = pred.wt_upper, group = group, fill = group),
    data = df.predicted_model_upper,
    alpha = .2
  ) +
  theme_classic() +
  labs(y = "CFU/ml")
logCFU + nonLOG + plot_layout(nrow = 2)
##----------------------------------
## Modelling: one growthcurver fit per "<species>_<group>" on the raw CFU values
##----------------------------------
##----------model2
df.predicted_model <- data.frame()
for (samplezzz in unique(rmk202_strain_timeseries_Sheet1_long_cleaned_model$groupingFinal)) {
  tmp <- rmk202_strain_timeseries_Sheet1_long_cleaned_model %>%
    filter(groupingFinal == samplezzz)
  # tmp$time <- as.double(tmp$time)
  model.wt <- SummarizeGrowth(tmp$time, tmp$CFU)
  # Evaluate the fit on 50 evenly spaced time points between 0 and 24.
  tt <- seq(0, 24, length = 50)
  tmp <- data.frame(
    group = samplezzz,
    time = tt,
    pred.wt = predict(model.wt$model, newdata = list(t = tt))
  )
  df.predicted_model <- rbind(df.predicted_model, tmp)
  # plot_overtime_box + geom_line(data=df.predicted, aes(x=time,y=pred.wt), color="red")
}
# Display order of the groups; the species is the part before the first "_".
df.predicted_model$group <- factor(
  df.predicted_model$group,
  levels = c(
    "S. thermophilus_rmk", "S. thermophilus_pairwise", "S. thermophilus_strepto", "S. thermophilus_lacto",
    "L. delbrueckii_rmk", "L. delbrueckii_pairwise", "L. delbrueckii_lacto", "L. delbrueckii_strepto"
  )
)
df.predicted_model$species <- str_split_fixed(df.predicted_model$group, "_", 2)[, 1]
# Fitted curves. The x limits are set inside scale_x_continuous(): when they
# were given to lims() and scale_x_continuous() was added afterwards, ggplot2
# replaced the first x scale and the limits were silently lost.
plot_overtime_model <- ggplot() +
  geom_line(
    data = df.predicted_model,
    aes(x = time, y = pred.wt, group = group, color = group, linetype = species),
    size = 1.25
  ) +
  theme_classic() +
  lims(y = c(0, maxValue)) +
  scale_color_manual(values = fill_colers) +
  scale_linetype_manual(values = c("dashed", "dotted")) +
  scale_x_continuous(limits = c(0, 24), breaks = c(0, 6, 12, 18, 24), labels = c(0, 6, 12, 18, 24))
plot_overtime_model
#
# Ribbon above, fitted curves below: once on screen, once into the SVG.
plot_overtime_RIBBON + plot_overtime_model + plot_layout(nrow = 2)
svg("03_results/growth_curve.svg", width = 8, height = 8)
#
plot_overtime_RIBBON + plot_overtime_model + plot_layout(nrow = 2)
dev.off()
# Phase the model curves ----
# Sample type = part of the group label after the first "_".
df.predicted_model$groupsss <- str_split_fixed(df.predicted_model$group, "_", 2)[, 2]
df.predicted_model$group
lagsss_mean
# Rename the pH groupings so that they match the CFU sample types.
lagsss_mean$grouping_new <- plyr::revalue(
  lagsss_mean$grouping,
  c("lactobacillus" = "lacto", "starterCulture" = "rmk", "streptococcus" = "strepto")
)
lagsss_mean_prepped <- lagsss_mean %>%
  dplyr::select(c("grouping_new", "hours", "hours_corrected"))
df.predicted_model_extended <- merge(
  df.predicted_model,
  lagsss_mean_prepped,
  by.x = "groupsss",
  by.y = "grouping_new"
)
# "rmk" curves are shifted by `hours`, all other curves by `hours_corrected`.
# df.predicted_model_extended$timeCurated <- df.predicted_model_extended$time -df.predicted_model_extended$hours_corrected
df.predicted_model_extended$timeCurated <- with(
  df.predicted_model_extended,
  ifelse(groupsss == "rmk", time - hours, time - hours_corrected)
)
# Keep the window [minTime, 18] of the shifted time axis.
df.predicted_model_extended <- df.predicted_model_extended %>%
  filter(timeCurated <= 18) %>%
  filter(timeCurated >= minTime)
table(df.predicted_model_extended$group)
# Collapse the group labels into the three line types used in the figure.
df.predicted_model_extended$typess <- plyr::revalue(
  df.predicted_model_extended$group,
  c(
    "S. thermophilus_rmk" = "RMK202", "L. delbrueckii_rmk" = "RMK202",
    "S. thermophilus_pairwise" = "pairwise", "L. delbrueckii_pairwise" = "pairwise",
    "S. thermophilus_strepto" = "isolate", "L. delbrueckii_lacto" = "isolate"
  )
)
table(df.predicted_model_extended$typess)
colorsss_01 <- c("#fec3b7ff", "#0081a7ff")
# NOTE(review): scale_x_continuous() is added after lims(x = ...); ggplot2 then
# replaces the x scale, so the x limits presumably have no effect - confirm.
plot_overtime_model <- ggplot() +
  geom_line(
    data = df.predicted_model_extended,
    aes(x = timeCurated, y = pred.wt, group = group, color = species, linetype = typess),
    size = 1.25
  ) +
  theme_classic() +
  lims(x = c(minTime, 18), y = c(0, maxValue)) +
  labs(y = "CFU/ml") +
  scale_color_manual(values = colorsss_01) +
  scale_linetype_manual(values = c("dashed", "dotted", "solid")) +
  scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c(0, 6, 12, 18)) # +theme(legend.position = "none")#+coord_trans(y="log2")
plot_overtime_model
# Phased ribbon ----
# Rebuild the "<species>_<sample>" label in display order.
group_growth_cfu_quantiles_spec$groupingFinal <- as.factor(
  paste0(group_growth_cfu_quantiles_spec$species, "_", group_growth_cfu_quantiles_spec$sample)
)
group_growth_cfu_quantiles_spec$groupingFinal <- factor(
  group_growth_cfu_quantiles_spec$groupingFinal,
  levels = c(
    "S. thermophilus_rmk", "S. thermophilus_pairwise", "S. thermophilus_strepto", "S. thermophilus_lacto",
    "L. delbrueckii_rmk", "L. delbrueckii_pairwise", "L. delbrueckii_lacto", "L. delbrueckii_strepto"
  )
)
# df.predicted_model$species <- str_split_fixed(group_growth_cfu_quantiles_spec$group, "_", 2)[,1]
# fill_colers <- c("grey77","grey55","red","grey88","grey55","orange")
fill_colers <- c("grey77", "#97BC62FF", "red", "grey88", "#339E66FF", "orange")
# colorr_colers <- c("grey77","grey77","grey77","grey55","grey55","grey55")
fill_colers <- c("#FC766AFF", "#783937FF", "#F1AC88FF", "#339E66FF", "#078282FF", "#95DBE5FF")
# y range of the ribbon: smallest and largest 75 % quantile.
maxValue <- max(group_growth_cfu_quantiles_spec$seventhts)
minnsValue <- min(group_growth_cfu_quantiles_spec$seventhts)
# Attach the lag offsets and shift the time axis in the same way as for the
# model curves ("rmk" by `hours`, everything else by `hours_corrected`).
group_growth_cfu_quantiles_spec_curated <- merge(
  group_growth_cfu_quantiles_spec,
  lagsss_mean_prepped,
  by.x = "sample",
  by.y = "grouping_new"
)
# group_growth_cfu_quantiles_spec_curated$timeCurated <- group_growth_cfu_quantiles_spec_curated$time -group_growth_cfu_quantiles_spec_curated$hours_corrected
group_growth_cfu_quantiles_spec_curated$timeCurated <- with(
  group_growth_cfu_quantiles_spec_curated,
  ifelse(sample == "rmk", time - hours, time - hours_corrected)
)
# Only the lower bound is applied here; the upper bound (<= 18) is commented out.
group_growth_cfu_quantiles_spec_curated <- group_growth_cfu_quantiles_spec_curated %>%
  filter(timeCurated >= minTime) # %>% filter(timeCurated<=18) %>% filter(timeCurated>=minTime)
# ,y=c(1,maxValue)
colorsss <- c("#0081a7ff", "#fec3b7ff", "#9f9f92ff", "#6d6466ff", "#4e3d42ff")
colorsss_01 <- c("#0081a7ff", "#fec3b7ff")
# NOTE(review): scale_x_continuous() is added after lims(x = ...); ggplot2 then
# replaces the x scale, so the x limits presumably have no effect - confirm.
plot_overtime_RIBBON <- ggplot() +
  geom_ribbon(
    aes(x = timeCurated, ymin = twentyfifths, ymax = seventhts, group = interaction(sample, species), fill = species),
    data = group_growth_cfu_quantiles_spec_curated,
    alpha = .2
  ) +
  theme_classic() +
  lims(x = c(minTime, 18), y = c(minnsValue, maxValue)) +
  scale_fill_manual(values = colorsss_01) +
  scale_color_manual(values = colorsss_01) +
  scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c(0, 6, 12, 18)) +
  theme(legend.position = "none") # +coord_trans(y="log2")
plot_overtime_RIBBON
# ,y=c(1,maxValue)
# plot_overtime_RIBBON + plot_overtime_model+plot_layout(nrow = 2)
# Three stacked panels: CFU ribbon, fitted CFU curves, pH ribbon.
plot_overtime_RIBBON +
  plot_overtime_model +
  (low_phased_ribbon + theme(legend.position = "none")) +
  plot_layout(nrow = 3)
# Same composition written to disk, with the legend of the model panel removed as well.
svg("03_results/growth_curve_02.svg", width = 6, height = 9)
#
plot_overtime_RIBBON +
  (plot_overtime_model + theme(legend.position = "none")) +
  (low_phased_ribbon + theme(legend.position = "none")) +
  plot_layout(nrow = 3)
dev.off()
# Stats: compare the final growth value (time point 18) ----
table(rmk202_strain_timeseries_Sheet1_long_cleaned_model$group)
table(rmk202_strain_timeseries_Sheet1_long_cleaned_model$Method)
table(rmk202_strain_timeseries_Sheet1_long_cleaned_model$groupingFinal)
table(rmk202_strain_timeseries_Sheet1_long_cleaned_model$time)
# L. delbrueckii grown alone ("lacto") versus L. delbrueckii in the pairwise
# mix or in the starter culture ("rmk").
ldel_vergleich_final_01 <- rmk202_strain_timeseries_Sheet1_long_cleaned_model %>%
  filter(groupingFinal == "L. delbrueckii_lacto", time == "18")
ldel_vergleich_final_02 <- rmk202_strain_timeseries_Sheet1_long_cleaned_model %>%
  filter(groupingFinal %in% c("L. delbrueckii_pairwise", "L. delbrueckii_rmk"), time == "18")
t.test(ldel_vergleich_final_01$CFU, ldel_vergleich_final_02$CFU)
# Between Sterm and Ldel: S. thermophilus in the pairwise mix / starter
# culture is prepared here for the comparison with the L. delbrueckii set above.
Sterm_vergleich_final_01 <- rmk202_strain_timeseries_Sheet1_long_cleaned_model %>%
  filter(groupingFinal %in% c("S. thermophilus_pairwise", "S. thermophilus_rmk"), time == "18")
# ldel_vergleich_final_02 <- rmk202_strain_timeseries_Sheet1_long_cleaned_model %>% filter(groupingFinal %in% c("L. delbrueckii_pairwise","L. delbrueckii_rmk")) %>% filter(time=="18")
t.test(Sterm_vergleich_final_01$CFU,ldel_vergleich_final_02$CFU)
1.4.3 Metabolomics
Here, I analyse the metabolomics data that I received from the GC/MS measurements of Pascal and Yihelene on 13.8.2020.
# Load the untargeted GC/MS feature table and the compound annotation.
# (A stray `prcomp(df, scale. = TRUE)` call was removed here: no object `df`
# is created in this section and its result was never used.)
X20200810_Vincent_cultures_profinder_untargeted <- read.xlsx("../data_zenodo/non_genomic_data/20200810_Vincent_cultures_profinder_untargeted.xlsx")
Metabolites_information_reduced <- read_delim(
  "../data_zenodo/non_genomic_data/Metabolites_information_reduced.csv",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
nrow(Metabolites_information_reduced)
par(mfrow = c(1, 1))
hist(Metabolites_information_reduced$`RT (avg)`)
# Keep compounds with an average retention time below 46.6.
Metabolites_information_reduced_goodOnes <- Metabolites_information_reduced %>%
  filter(`RT (avg)` < 46.6)
nrow(Metabolites_information_reduced_goodOnes)
# Keep the first four (non-compound) columns plus the retained compounds.
# colnames(X20200810_Vincent_cultures_profinder_untargeted)[1:4]
X20200810_Vincent_cultures_profinder_untargeted <- X20200810_Vincent_cultures_profinder_untargeted %>%
  dplyr::select(c(
    colnames(X20200810_Vincent_cultures_profinder_untargeted)[1:4],
    Metabolites_information_reduced_goodOnes$compounds
  ))
# Metabolites_information_reduced_goodOnes
# colnames() returns a vector, so it takes a single index (was `[1:3,1:10]`).
colnames(X20200810_Vincent_cultures_profinder_untargeted)[1:10]
X20200810_Vincent_cultures_profinder_untargeted[1:3, 1:10]
##------------------remove zero columns
# Compound columns (5 onwards) whose values sum to zero are dropped.
zero_compounds <- names(which(
  colSums(X20200810_Vincent_cultures_profinder_untargeted[, 5:ncol(X20200810_Vincent_cultures_profinder_untargeted)]) == 0
))
X20200810_Vincent_cultures_profinder_untargeted_new <- X20200810_Vincent_cultures_profinder_untargeted[
  ,
  !(colnames(X20200810_Vincent_cultures_profinder_untargeted) %in% zero_compounds)
]
dim(X20200810_Vincent_cultures_profinder_untargeted)
dim(X20200810_Vincent_cultures_profinder_untargeted_new)
# which(colSums(X20200810_Vincent_cultures_profinder_untargeted_new[,5:ncol(X20200810_Vincent_cultures_profinder_untargeted_new)])==0)
# ##-----------------------
# ##----------------------NON-ADJUSTED SCALES
# ##-----------------------
# PCA on the compound columns, neither centred nor scaled. It is computed
# before `ploting_data`, which is derived from its scores (the original order
# used `metabolo.pca` and `ploting_data` before they were defined).
metabolo.pca <- prcomp(
  X20200810_Vincent_cultures_profinder_untargeted_new[, 5:ncol(X20200810_Vincent_cultures_profinder_untargeted_new)],
  center = FALSE,
  scale. = FALSE
)
# PCA scores plus the sample annotation used for grouping.
ploting_data <- metabolo.pca$x %>% as.data.frame()
ploting_data$sample <- X20200810_Vincent_cultures_profinder_untargeted$sample
ploting_data$biological_duplicate <- X20200810_Vincent_cultures_profinder_untargeted$biological_duplicate
ploting_data$analytical_duplicate <- X20200810_Vincent_cultures_profinder_untargeted$analytical_duplicate
# The first letter of the sample name encodes the culture type.
ploting_data$grouping <- substring(X20200810_Vincent_cultures_profinder_untargeted$sample, 1, 1)
table(ploting_data$grouping)
ploting_data$grouping_long <- plyr::revalue(
  ploting_data$grouping,
  c(
    "A" = "All strains combined", "L" = "L. delbrueckii only", "S" = "S. thermophilus only",
    "R" = "original RMK202 starter culture", "M" = "pairwise strain mix"
  )
)
ploting_data
unique(ploting_data$grouping_long)
# Palette for the five groups (an earlier variant without the leading "#" was
# overwritten before use and has been dropped).
colorsss <- c("#4e3d42ff", "#fec3b7ff", "#9f9f92ff", "#6d6466ff", "#0081a7ff")
pcaFINAL <- ggbiplot(
  metabolo.pca,
  var.axes = FALSE,
  groups = ploting_data$grouping_long,
  ellipse = TRUE,
  obs.scale = 1,
  size = 2
) +
  theme_classic() +
  scale_color_manual(values = colorsss) +
  scale_fill_manual(values = colorsss)
# pcaFINAL
svg("03_results/20200810_Vincent_cultures_pca_FINAL.svg", width = 8, height = 5)
pcaFINAL
dev.off()
1.5 Figure 5
Figure 5. CRISPR spacer diversity of L. delbrueckii and S. thermophilus. A) The correlation between the fraction of shared CRISPR spacers and ANI for all L. delbrueckii and S. thermophilus, with the corresponding densities and heatmaps on the x- and y-axes. B) The heatmap of the genomic and CRISPR spacer diversities of all S. thermophilus, illustrated with ANI (top heatmap; from white to red) and percent shared CRISPR spacers (bottom heatmap; from white to blue). C) The number of metagenomic and genomic CRISPR spacers for each of the five arrays.
1.5.1 ANI
FastANI was used to calculate all pairwise ANI values
###-----------------------------------------
## make list: one genome path per line, one list file per species
###-----------------------------------------
for species in Ldel Sterm
do
  genome_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/${species}/FNA_all
  out_dir=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/${species}
  mkdir -p "${out_dir}"
  # Truncate instead of `rm`: no error on the first run, when the list does not exist yet.
  : > "${out_dir}/fastani_rmk202.txt"
  # Glob instead of parsing `ls | grep ".fna$"` (the unescaped dot matched any character).
  for genome in "${genome_dir}"/*.fna
  do
    # An unmatched glob stays literal; skip it so no bogus path is written.
    [ -e "${genome}" ] || continue
    echo "${genome}" >> "${out_dir}/fastani_rmk202.txt"
  done # genomes
done # species
###-----------------------------------------
## fastani: all-vs-all, the same list is used as query and reference
###-----------------------------------------
for species in Ldel Sterm
do
  out_dir=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/${species}
  fastANI --fragLen 1000 -t 37 --ql "${out_dir}/fastani_rmk202.txt" \
    --rl "${out_dir}/fastani_rmk202.txt" \
    -o "${out_dir}/fastaANI_comparision.txt"
done # species
##==============================
##ANI tables (Sterm, then Ldel)
library(readr)
library(plyr)
library(tidyverse)

# Read one fastANI output table and tidy it.
#   path       : tab-separated fastANI output (no header line)
#   genome_dir : directory prefix to strip from the genome paths
# Returns the table with an added `coverage_ANI` column (percent of mapped
# fragments), shortened genome names and ANI rounded to two digits.
curate_fastani <- function(path, genome_dir) {
  ani <- read_delim(
    path, "\t",
    escape_double = FALSE,
    col_names = c("GenomeA","GenomeB","ANI","mappedFragemnts","totFragemnts"),
    trim_ws = TRUE
  )
  ani$coverage_ANI <- 100 * (ani$mappedFragemnts / ani$totFragemnts)
  # Remove the directory, the file extension and the isolate prefixes, in this
  # order. The pieces are matched literally: as regular expressions the "." in
  # ".fna" / ".fasta" would match any character.
  shorten_genome_name <- function(genome) {
    for (piece in c(genome_dir, ".fna", ".fasta", "L_I_202_", "S_O_202_", "S_I_202_")) {
      genome <- str_remove(genome, stringr::fixed(piece))
    }
    genome
  }
  ani$GenomeA <- shorten_genome_name(ani$GenomeA)
  ani$GenomeB <- shorten_genome_name(ani$GenomeB)
  ani$ANI <- round(ani$ANI, digits = 2)
  ani
}

##==============================
##Sterm
##==============================
final_ANI <- curate_fastani(
  "../data_zenodo/non_genomic_data//fastaANI_comparision_sterm.txt",
  "/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Sterm/FNA_all/"
)
# final_ANI$GenomeB <- revalue(final_ANI$GenomeB, c("S50"="mst1","S72"="mst2"))
# final_ANI$GenomeA <- revalue(final_ANI$GenomeA, c("S50"="mst1","S72"="mst2"))
write.table(final_ANI, "../03_results/fastaANI_comparision_curated_sterm.txt", sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
##==============================
##Ldel
##==============================
# NOTE: `final_ANI` is reused, so after this section it holds the Ldel table.
final_ANI <- curate_fastani(
  "../data_zenodo/non_genomic_data//fastaANI_comparision_ldel.txt",
  "/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Ldel/FNA_all/"
)
final_ANI
write.table(final_ANI, "../03_results/fastaANI_comparision_curated_ldel.txt", sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
##------------
1.5.3 metagenomic/genomic spacers
library(readr)
library(ggplot2)
# Spacer table (earlier local copy: ~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/ForDADA2/uniqueSpacers_count_Ref_wOLDstrains.txt)
uniqueSpacers_count_Ref_wOLDstrains <- read_delim(
  "../data_zenodo/non_genomic_data//uniqueSpacers_count_Ref_wOLDstrains.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Label every spacer by where it was seen: in the metagenome (dadaSpacer > 0),
# in the strains (Strains > 0), or both. Everything that does not match the
# first two conditions falls through to "explained only by strains".
uniqueSpacers_count_Ref_wOLDstrains$explained <- with(
  uniqueSpacers_count_Ref_wOLDstrains,
  ifelse(
    dadaSpacer > 0 & Strains > 0,
    "both Meta & strains",
    ifelse(
      dadaSpacer > 0 & Strains == 0,
      "explained only by meta",
      "explained only by strains"
    )
  )
)
# table(uniqueSpacers_count_Ref_wOLDstrains$explained)
##-------------plot explained
colorsss <- rev(c("darkcyan", "darkturquoise", "lightblue"))
# Fix the stacking/legend order of the three categories.
uniqueSpacers_count_Ref_wOLDstrains$explained <- factor(
  uniqueSpacers_count_Ref_wOLDstrains$explained,
  levels = c("explained only by meta", "both Meta & strains", "explained only by strains")
)
# Rename the array identifiers a1/a2 to CR4/CR5.
uniqueSpacers_count_Ref_wOLDstrains$ARRAYINFO <- revalue(
  uniqueSpacers_count_Ref_wOLDstrains$ARRAYINFO,
  c("a2"="CR5","a1"="CR4")
)
table(
  uniqueSpacers_count_Ref_wOLDstrains$explained,
  uniqueSpacers_count_Ref_wOLDstrains$ARRAYINFO
)
##new and wider
svg("../03_results//spacerexplained_wide_Ldel.svg", width = 4, height = 3)
ggplot(
  uniqueSpacers_count_Ref_wOLDstrains,
  aes(x = ARRAYINFO, group = explained, fill = explained)
) +
  geom_bar() +
  theme_classic() +
  scale_fill_manual(values = colorsss) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9)
  ) +
  labs(y = "spacer counts", fill = "") +
  scale_y_continuous(
    breaks = c(0,10,27,39,92,117,139,163),
    labels = c(0,10,27,39,92,117,139,163)
  )
dev.off()
##------------------------
# Which spacers are found in the isolates only?
##------------------------
ONlyIsolateSPACERS <- filter(
  uniqueSpacers_count_Ref_wOLDstrains,
  explained == "explained only by strains"
)
# Look the strain-only spacers up in the two spacer annotation tables
# (both tables are created elsewhere in this document).
merge(
  x = ONlyIsolateSPACERS,
  y = spacer_Infos_Sterm_final,
  by.x = "ClusterINFO",
  by.y = "ClusterName"
)
merge(
  x = ONlyIsolateSPACERS,
  y = uniqueSpacersss_extended,
  by.x = "ClusterINFO",
  by.y = "ClusterName"
)
##------------------------
# Include Sterm
##------------------------
uniqueSpacers_count_Ref_wOLDstrains_sterm <- read_delim(
  "../data_zenodo/non_genomic_data//uniqueSpacers_count_Ref_wOLDstrains_both.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Same labelling as for the first spacer table: metagenome (dadaSpacer > 0),
# strains (Strains > 0) or both; everything else falls through to
# "explained only by strains".
uniqueSpacers_count_Ref_wOLDstrains_sterm$explained <- with(
  uniqueSpacers_count_Ref_wOLDstrains_sterm,
  ifelse(
    dadaSpacer > 0 & Strains > 0,
    "both Meta & strains",
    ifelse(
      dadaSpacer > 0 & Strains == 0,
      "explained only by meta",
      "explained only by strains"
    )
  )
)
##-------------plot explained
colorsss <- rev(c("darkcyan", "darkturquoise", "lightblue"))
uniqueSpacers_count_Ref_wOLDstrains_sterm$explained <- factor(
  uniqueSpacers_count_Ref_wOLDstrains_sterm$explained,
  levels = c("explained only by meta", "both Meta & strains", "explained only by strains")
)
# Rename the array identifiers a1/a2/a3 to CR1/CR2/CR3.
uniqueSpacers_count_Ref_wOLDstrains_sterm$ARRAYINFO <- revalue(
  uniqueSpacers_count_Ref_wOLDstrains_sterm$ARRAYINFO,
  c("a2"="CR2","a1"="CR1","a3"="CR3")
)
table(
  uniqueSpacers_count_Ref_wOLDstrains_sterm$explained,
  uniqueSpacers_count_Ref_wOLDstrains_sterm$ARRAYINFO
)
##------------------------
# merge: stack the Sterm table on top of the first spacer table
##------------------------
uniqueSpacers_count_Ref_wOLDstrains_merge <- rbind(uniqueSpacers_count_Ref_wOLDstrains_sterm, uniqueSpacers_count_Ref_wOLDstrains)
table(uniqueSpacers_count_Ref_wOLDstrains_merge$explained, uniqueSpacers_count_Ref_wOLDstrains_merge$ARRAYINFO)
# Build the stacked bar chart once and reuse it for the on-screen preview and
# the SVG export (the identical ggplot expression used to be written twice).
spacer_count_breaks <- c(0,10,17,27,39,54,92,101,117,139,163)
spacer_plot_merged <- ggplot(
  uniqueSpacers_count_Ref_wOLDstrains_merge,
  aes(x = ARRAYINFO, group = explained, fill = explained)
) +
  geom_bar() +
  theme_classic() +
  scale_fill_manual(values = colorsss) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9)
  ) +
  labs(y = "spacer counts", fill = "") +
  scale_y_continuous(breaks = spacer_count_breaks, labels = spacer_count_breaks)
print(spacer_plot_merged)
svg("../03_results//spacerexplained_wide_merged.svg", width = 5, height = 3)
print(spacer_plot_merged)
dev.off()
1.6 Figure 6
Figure 6. Characteristics of the phages identified in the cheese starter cultures. A) Gene annotation of the two Streptococcus starter culture phages, RMK202_1 and RMK202_2, and the two closest relatives (illustrated in lighter colors). Protein similarity between genes is indicated in grey (80-95% identity) and black (95-100%). B) Relative abundance of bacteria and phages over all metagenomic samples based on genome read coverage. C) Fraction of Streptococcus genomes with an integrated phage, based on the read coverage of phage-bacteria spanning regions relative to the coverage of the S. thermophilus genome. D) Fraction of Streptococcus phages that show signs of integration, based on the read coverage of phage-bacteria spanning regions relative to the coverage of the Streptococcus phage genomes. E) The number of spacers mapping against the different phage types. F) The Streptococcus phage network with the protospacer-containing phages colored or labeled according to phage type. G) The spacer abundance versus the protospacer abundance for all phage spacers. The database-specific linear regressions and distributions are indicated in the figure and along the axes, respectively.
1.6.1 Phage annotation
Prepare for alignment with BLAST
cd /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/06_phage/startaligned/Prodigal
# Starter-culture phages: turn every " # " separator in the protein file into
# "-" and cut each line from the first "-1-" onwards.
for phage_id in 1 2
do
  stem="Streptococcus_phage_${phage_id}_startAligned_Final_cleaned"
  sed 's/ # /-/g' "${stem}.faa" | sed 's/-1-.*$//g' > "${stem}_readyBLAST.faa"
done
# Reference phages: replace " [locus...location=" with "-", drop everything
# from the first "]" onwards and turn ".." into "-".
for ref in sw30 9874
do
  sed 's/ \[locus.*location=/-/g' "s_term_${ref}.faa" | sed 's/\].*$//g' | sed 's/\.\./-/g' > "s_term_${ref}_readyBLAST.faa"
done
##-------------after alignment
cd /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/06_phage/startaligned/alignments
# Keep the line following each "hits found" line, drop comment and
# separator lines, and keep the first three columns.
for pair in Sterm_1_vs_2 Sterm_2_vs_sw30
do
  grep "hits found" -A 1 "${pair}.txt" | grep "^#" -v | grep "^-" -v | cut -f 1,2,3 > "${pair}_cleaned.txt"
done
grep "hits found" -A 1 Sterm_9874_vs_1.txt |grep "^#" -v |grep "^-" -v |cut -f 1,2,3 > Sterm_9874_vs_1_cleaned.txt
Download the hit table (txt) from the online BLAST results.
library(readr)
# genoPlotR example data, kept as a reference for the comparison format.
data(three_genes)
comparisons[[1]]$col <- apply_color_scheme(c(0.6, 0.4, 0.5), "grey")
comparisons

# Read a cleaned BLAST hit table (query, subject, score) and derive the
# coordinate and colour columns needed for a genoPlotR comparison.
# Query and subject ids are split on "-"; pieces 2 and 3 are taken as start
# and end coordinates.
read_blast_hits <- function(path) {
  hits <- read_delim(
    path, "\t",
    escape_double = FALSE,
    col_names = c("query","subject","score"),
    trim_ws = TRUE
  )
  # Split each id column once instead of once per derived column.
  query_parts <- str_split_fixed(hits$query, fixed("-"), 4)
  subject_parts <- str_split_fixed(hits$subject, fixed("-"), 4)
  hits$start1 <- as.numeric(query_parts[, 2])
  hits$end1 <- as.numeric(query_parts[, 3])
  hits$start2 <- as.numeric(subject_parts[, 2])
  hits$end2 <- as.numeric(subject_parts[, 3])
  # Link colour by score: < 95 "grey66", 95 to < 99 "grey88", otherwise "black".
  hits$col <- ifelse(hits$score < 95, "grey66", ifelse(hits$score < 99, "grey88", "black"))
  hits
}

# Keep the hits with score > 80 and convert them to a genoPlotR comparison.
hits_to_comparison <- function(hits) {
  hits %>%
    add_column(., "direction" = "1") %>%
    dplyr::filter(score > 80) %>%
    dplyr::select(start1, end1, start2, end2, direction, col) %>%
    as.comparison()
}

##-------------------------
##Streptococcus_phage_Sterm_1_vs_2
##-------------------------
Sterm_1_vs_2_cleaned <- read_blast_hits("../data_zenodo/non_genomic_data//Sterm_1_vs_2_cleaned.txt")
Sterm_1_vs_2_cleaned_final <- hits_to_comparison(Sterm_1_vs_2_cleaned)
##-------------------------
##Streptococcus_phage_Sterm_2_vs_sw30
##-------------------------
Sterm_2_vs_sw30_cleaned <- read_blast_hits("../data_zenodo/non_genomic_data//Sterm_2_vs_sw30_cleaned.txt")
Sterm_2_vs_sw30_cleaned_final <- hits_to_comparison(Sterm_2_vs_sw30_cleaned)
##-------------------------
##Streptococcus_phage_Sterm_9874_vs_1
##-------------------------
Sterm_9874_vs_1_cleaned <- read_blast_hits("../data_zenodo/non_genomic_data//Sterm_9874_vs_1_cleaned.txt")
Sterm_9874_vs_1_cleaned_final <- hits_to_comparison(Sterm_9874_vs_1_cleaned)
##-----------------
##bring together
##-----------------
comparisons_mine <- list(Sterm_9874_vs_1_cleaned_final,Sterm_1_vs_2_cleaned_final,Sterm_2_vs_sw30_cleaned_final)
I annotated the phages by blasting all the proteins against the BLAST DB. Then I manually transferred the information. Now I will create a file for genoPlotR.
# Convert the curated phage annotations (*_forR.gff) into tab-separated tables
# with the columns name, start, end, strand, col, fill, gene_type.
# Each table is written in two steps: the header line (">" overwrites the file),
# then the rows (">>" appends).
# Row pipeline:
#   1. awk splits on whitespace, glues fields 9-15 together without a separator
#      and prints them with field 4, field 5 and the constants "1", "black",
#      "arrows" (tab-separated).
#   2. sed removes everything from "product=" (or "ID=") up to and including
#      ";group=" from the line.
#   3. the second awk repeats column 1 as the "fill" column, so name and fill
#      start out identical.
cd /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/06_phage/startaligned/annotation/
# Streptococcus virus 9874 (sed strips "product=...;group=")
echo -e "name\tstart\tend\tstrand\tcol\tfill\tgene_type" > Streptococcus_virus_9874_for_genoplotR.gff
awk -F " " '{OFS="\t"}{print $9 $10 $11 $12 $13 $14 $15,$4,$5,"1","black","arrows"}' Streptococcus_virus_9874_forR.gff | sed 's/product=.*;group=//g' |awk -F "\t" '{OFS="\t"}{print $1,$2,$3,$4,$5,$1,$6}' >> Streptococcus_virus_9874_for_genoplotR.gff
###--------------------
# Streptococcus phage 1 (sed strips "ID=...;group=")
echo -e "name\tstart\tend\tstrand\tcol\tfill\tgene_type" > Streptococcus_phage_1_startAligned_Final_for_genoplotR.gff
awk -F " " '{OFS="\t"}{print $9 $10 $11 $12 $13 $14 $15,$4,$5,"1","black","arrows"}' Streptococcus_phage_1_startAligned_Final_forR.gff | sed 's/ID=.*;group=//g' |awk -F "\t" '{OFS="\t"}{print $1,$2,$3,$4,$5,$1,$6}' >> Streptococcus_phage_1_startAligned_Final_for_genoplotR.gff
###--------------------
# Streptococcus phage 2 (sed strips "ID=...;group=")
echo -e "name\tstart\tend\tstrand\tcol\tfill\tgene_type" > Streptococcus_phage_2_startAligned_Final_for_genoplotR.gff
awk -F " " '{OFS="\t"}{print $9 $10 $11 $12 $13 $14 $15,$4,$5,"1","black","arrows"}' Streptococcus_phage_2_startAligned_Final_forR.gff | sed 's/ID=.*;group=//g' |awk -F "\t" '{OFS="\t"}{print $1,$2,$3,$4,$5,$1,$6}' >> Streptococcus_phage_2_startAligned_Final_for_genoplotR.gff
# Print the result for a visual check.
cat Streptococcus_phage_2_startAligned_Final_for_genoplotR.gff
###--------------------
# Streptococcus phage SW30: header line only; the rows are appended by the next command.
echo -e "name\tstart\tend\tstrand\tcol\tfill\tgene_type" > Streptococcus_phage_SW30_for_genoplotR.gff
awk -F " " '{OFS="\t"}{print $9 $10 $11 $12 $13 $14 $15,$4,$5,"1","black","arrows"}' Streptococcus_phage_SW30_forR.gff | sed 's/product=.*;group=//g' |awk -F "\t" '{OFS="\t"}{print $1,$2,$3,$4,$5,$1,$6}' >> Streptococcus_phage_SW30_for_genoplotR.gff
library(genoPlotR)
library(plyr)
library(tidyverse)
library(readr)

# One shared gene-function -> fill-colour map for all four phages. It is the
# superset of the per-phage maps that used to be repeated (and had started to
# diverge); every function keeps the colour it had before.
phage_fill_colours <- c(
  "portalprotein"="red","largeterminasesubunit"="red","smallterminasesubunit"="red",
  "majorcapsidprotein"="green","minorcapsidprotein"="green",
  "head-tailconnectorprotein"="blue","baseplatecomponentprotein"="yellow",
  "scaffoldingprotein"="pink","tailprotein"="pink","majortailprotein"="pink",
  "tape-measureprotein"="green","tailchaperoneprotein"="green","distaltailprotein"="yellow",
  "hypotheticalprotein"="grey","holin"="green","lysin"="green","tail-associatedlysin"="blue",
  "replication"="brown","antireceptorprotein"="blue","other"="grey88",
  "cro-likeprotein"="violet","HNHendonuclease"="grey88","endonuclease"="grey88",
  "Protease"="grey88","tailcompletionprotein"="blue","Integrase"="cyan","Acr-like"="gold",
  "baseplateprotein"="violet","Head-closureprotein"="blue","repressor"="cyan",
  "regulator"="grey88","antirepressor"="cyan"
)

# Read one "*_for_genoplotR.gff" table (tab-separated, with header line) as a
# plain data.frame.
read_phage_annotation <- function(path) {
  read_delim(path, "\t", escape_double = FALSE, trim_ws = TRUE) %>% as.data.frame()
}

# Replace the gene-function labels in `fill` by their colours. Functions that
# do not occur in a given phage are expected, hence warn_missing = FALSE.
colour_gene_functions <- function(annotation) {
  annotation$fill <- plyr::revalue(annotation$fill, phage_fill_colours, warn_missing = FALSE)
  annotation
}

##-------------------------
##Streptococcus_phage_SW30
##-------------------------
SW30 <- read_phage_annotation("../data_zenodo/non_genomic_data/Streptococcus_phage_SW30_for_genoplotR.gff")
table(SW30$fill) # gene functions before recolouring
SW30 <- colour_gene_functions(SW30)
table(SW30$fill) # colours after recolouring; leftover labels show up here
table(SW30$col)
dna_seg1_SW30 <- dna_seg(SW30)
dna_segs_SW30 <- list(dna_seg1_SW30)
plot_gene_map(dna_segs = dna_segs_SW30)
##-------------------------
##Streptococcus_phage_2
##-------------------------
rmk_2 <- read_phage_annotation("../data_zenodo/non_genomic_data/Streptococcus_phage_2_startAligned_Final_for_genoplotR.gff")
table(rmk_2$fill)
rmk_2 <- colour_gene_functions(rmk_2)
table(rmk_2$fill)
table(rmk_2$col)
dna_seg1_rmk_2 <- dna_seg(rmk_2)
dna_segs_rmk_2 <- list(dna_seg1_rmk_2)
plot_gene_map(dna_segs = dna_segs_rmk_2)
##-------------------------
##Streptococcus_virus_9874
##-------------------------
phage_9874 <- read_phage_annotation("../data_zenodo/non_genomic_data/Streptococcus_virus_9874_for_genoplotR.gff")
table(phage_9874$fill)
phage_9874 <- colour_gene_functions(phage_9874)
table(phage_9874$fill)
table(phage_9874$col)
dna_seg1_9874 <- dna_seg(phage_9874)
dna_segs_9874 <- list(dna_seg1_9874)
plot_gene_map(dna_segs = dna_segs_9874)
##-------------------------
##Streptococcus_phage_1
##-------------------------
rmk_1 <- read_phage_annotation("../data_zenodo/non_genomic_data/Streptococcus_phage_1_startAligned_Final_for_genoplotR.gff")
table(rmk_1$fill)
rmk_1 <- colour_gene_functions(rmk_1)
table(rmk_1$fill)
table(rmk_1$col)
dna_seg1_rmk_1 <- dna_seg(rmk_1)
dna_segs_rmk_1 <- list(dna_seg1_rmk_1)
plot_gene_map(dna_segs = dna_segs_rmk_1)
###--------------------
##merge
###--------------------
# Order of the tracks: 9874, RMK202 phage 1, RMK202 phage 2, SW30.
all_toegther_annotation <- list(dna_seg1_9874, dna_seg1_rmk_1, dna_seg1_rmk_2, dna_seg1_SW30)
plot_gene_map(dna_segs=all_toegther_annotation)
with comparison
1.6.2 contig quantification
Here, we quantify not only the bacterial but also the phage abundance by taking the mapping coverage and normalising it by the total read abundance.
# source("/home/vincent/Desktop/ScriptRepository/R_functions-master/R_functions/g_legend.R") ##for plotting only legends
library(dplyr)
##===================================
#-------------file import
read_count <- read_delim(
  "../data_zenodo/non_genomic_data//all_2bacteria_and_phages_from_MAG.bed", "\t",
  escape_double = FALSE,
  col_names = c("chr","start","end","count","length_mapped","geneLength","unknown","sample"),
  trim_ws = TRUE
)
table(read_count$chr)
# Per-gene coverage: count * 600 / gene length.
# NOTE(review): 600 is presumably the sequenced bases per counted fragment - confirm.
read_count$geneCoverage <- (read_count$count * 600) / read_count$geneLength
# Median gene coverage per sample and replicon, for the two bacterial
# chromosomes and three phages only. Filtering first avoids summarising
# replicons that are discarded anyway; ungroup() prevents the leftover
# `sample` grouping from leaking into the steps below.
all_final <- read_count %>%
  dplyr::filter(chr %in% c("CP046131","CP046134","Lactobacillus_phage_1","Streptococcus_phage_1","Streptococcus_phage_2")) %>%
  group_by(sample, chr) %>%
  dplyr::summarize(median = median(geneCoverage)) %>%
  ungroup()
# Percent of the summed median coverage of each sample.
total_samples_sumTreatment <- aggregate(. ~sample, data = all_final[, c("sample","median")], sum, na.rm = TRUE)
all_final$total_coverage <- total_samples_sumTreatment[match(all_final$sample, total_samples_sumTreatment$sample), "median"]
all_final$percent_coverage <- 100 * (all_final$median / all_final$total_coverage)
table(all_final$sample)
# Drop the two cheesemaking samples and fix the sample order for plotting.
all_final <- all_final %>% dplyr::filter(!sample %in% c("th_K2_8h","di_K2_6h"))
all_final$sample <- factor(all_final$sample, levels = c("lyo202_96","Lyo_202_2012","Konserve_202","Lyo_202_2014","RMK202","Versand_202","G1_6_18","G2_6_18","G3_6_18","G4_6_18","G5_6_18"))
# NOTE(review): both files receive the same table, without a header line but
# with row names - confirm this is intended.
write.table(all_final, "../03_results//coverage_rmk202.tsv", sep = "\t", quote = FALSE, col.names = FALSE)
write.table(all_final, "../03_results//coverage_rmk202_n32.tsv", sep = "\t", quote = FALSE, col.names = FALSE)
# Seven-colour palette of the earlier version of the figure.
all_colours <- c("#C5F6FA" ,"#6CF5A3" ,"#36E37B" ,"#10B552", "darkorange", "#FAA0A0","#EB4D4D")
##----------------change name
library(plyr)
library(dplyr)
# Human-readable sample labels for the x axis.
# NOTE(review): "Lyo 1996" uses a space while the other labels use a line break - confirm.
all_final$sample <- revalue(all_final$sample, c("lyo202_96"="Lyo 1996","Lyo_202_2012"="Lyo\n2012", "Lyo_202_2014"="Lyo\n2014", "Konserve_202"="working\nstock", "RMK202"="starter\nculture\n2012", "Versand_202"="starter\nculture\n2018"))
##----------------plot
# Fix the stacking order of the replicons.
all_final$chr <- factor(all_final$chr, levels = c("Lactobacillus_phage_2" , "Lactobacillus_phage_1" ,"CP046133", "CP046132","Streptococcus_phage_2","Streptococcus_phage_1","CP046135","CP046131","CP046134"))
# Inspect the levels AFTER the conversion; on the former character column
# levels() only returned NULL.
levels(all_final$chr)
# Final five-colour palette (two earlier nine-colour versions were overwritten
# immediately and have been removed).
all_colours_new <- c("#a0cbd2","#ffa300","#ff8a00", "#10B552","#EB4D4D")
# Stacked bars: percent coverage per sample, one fill colour per replicon.
PrelAbundance <- ggplot(
  data = all_final,
  aes(x = sample, y = percent_coverage, group = interaction(chr), fill = chr)
) +
  geom_bar(stat = "identity") +
  labs("", x = "", y = "relative abundance") +
  scale_fill_manual(values = all_colours_new) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
    legend.position = "right",
    legend.title = element_blank()
  )
# On-screen preview.
PrelAbundance
# Export for the figure.
svg("../03_results//relative_abundance.svg", width = 4.5, height = 3)
PrelAbundance
dev.off()
# Earlier export targets, kept for reference:
# svg("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.svg",width=7,height=4)
# png("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.png", width = 2000, height = 1000,res=300)
##----------------amount phages
## Keep only the four phage replicons.
all_final_phage <- all_final %>%
  filter(chr %in% c(
    "Lactobacillus_phage_1", "Lactobacillus_phage_2",
    "Streptococcus_phage_1", "Streptococcus_phage_2"
  ))

## Summed phage share (percent_coverage) per sample.
all_final_phage %>%
  group_by(sample) %>%
  dplyr::summarise(sum = sum(percent_coverage))

## Mean and sd of the median coverage of Lactobacillus_phage_1.
all_final_phage$median_01 <- all_final_phage$median
all_final_phage %>%
  filter(chr == "Lactobacillus_phage_1") %>%
  group_by(chr) %>%
  dplyr::summarise(mean = mean(median_01), sd = sd(median_01))

## Mean and sd across samples of the per-sample summed median coverage.
all_final_phage %>%
  group_by(sample) %>%
  dplyr::summarise(sum = sum(median)) %>%
  dplyr::summarise(mean = mean(sum), sd = sd(sum))

## Same, for the per-sample summed percent_coverage.
all_final_phage %>%
  group_by(sample) %>%
  dplyr::summarise(sum = sum(percent_coverage)) %>%
  dplyr::summarise(mean = mean(sum), sd = sd(sum))

## Re-normalise within the phage fraction: each phage's median coverage as a
## percentage of the summed phage median coverage of its sample.
total_samples_phages <- aggregate(
  . ~ sample,
  data = all_final_phage[, c("sample", "median")],
  sum,
  na.rm = TRUE
)
all_final_phage$total_coverage_phages <- total_samples_phages[
  match(all_final_phage$sample, total_samples_phages$sample),
  "median"
]
all_final_phage$percent_coverage_phages <-
  100 * (all_final_phage$median / all_final_phage$total_coverage_phages)

## Label rows by host genus: any chr containing a capital "S" is treated as a
## Streptococcus phage, everything else as a Lactobacillus phage.
all_final_phage$genus <- ifelse(
  grepl("S", all_final_phage$chr),
  "Streptococcus_phages",
  "Lactobacillus_phages"
)

## Phage-internal share per sample and phage, then Lactobacillus_phage_1 only.
all_final_phage %>%
  group_by(sample, chr) %>%
  dplyr::summarise(sum = sum(percent_coverage_phages))
all_final_phage %>%
  group_by(sample, chr) %>%
  dplyr::summarise(sum = sum(percent_coverage_phages)) %>%
  filter(chr == "Lactobacillus_phage_1")
table(all_final_phage$chr)

## Phage-internal share per sample and host genus.
all_final_phage %>%
  group_by(sample, genus) %>%
  dplyr::summarise(sum = sum(percent_coverage_phages))
##----------------amount of Streptococci and Lactobacilli
## Genus label derived from the species column: entries containing "S_" are
## labelled Streptococcus, everything else Lactobacillus.
## The first label used to be the accession string "CP046131", which is not a
## genus name (and is the accession the L. delbrueckii MAG is renamed to
## elsewhere in this file).
## NOTE(review): confirm nothing downstream still filters on genus == "CP046131".
all_final$genus <- ifelse(
  grepl("S_", all_final$species),
  "Streptococcus",
  "Lactobacillus"
)

## Keep only the two replicons CP046131 and CP046134.
all_final_bacteria <- all_final %>%
  filter(chr %in% c("CP046131", "CP046134"))

## Range and median of the relative abundance per replicon across samples.
all_final_bacteria %>%
  group_by(sample, chr) %>%
  dplyr::summarize(sum = sum(percent_coverage)) %>%
  group_by(chr) %>%
  dplyr::summarize(
    min = min(sum),
    max = max(sum),
    median = median(sum)
  )

##----------------amount Ldelplasmid
## Range of the relative abundance of CP046132 across samples.
all_final_plasmid <- all_final %>%
  filter(chr %in% c("CP046132"))
all_final_plasmid %>%
  group_by(sample, chr) %>%
  dplyr::summarize(sum = sum(percent_coverage)) %>%
  group_by(chr) %>%
  dplyr::summarize(
    min = min(sum),
    max = max(sum)
  )

##----------------amount of Lactobacillus delbrueckii RMK202
## Minimum / maximum percent_coverage per replicon, without summing per sample.
all_final_bacteria %>%
  group_by(chr) %>%
  dplyr::summarize(
    min = min(percent_coverage),
    max = max(percent_coverage)
  )

##----------------coverage of Lactobacillus delbrueckii RMK202
table(all_final_bacteria$species)
## NOTE(review): this selects on the species column, not chr - it returns rows
## only if species actually holds the accession string; confirm.
all_final_bacteria[which(all_final_bacteria$species == "CP046131"), ]
##-------------------------------plasmid copy number
## One row per sample with the median coverage of CP046132 and CP046131 side
## by side. Note that the input here is all_final_all, not all_final.
all_final_copyNUMBER <- all_final_all %>%
  filter(chr %in% c("CP046132", "CP046131")) %>%
  select(sample, chr, median) %>%
  spread(chr, median)

## Coverage of CP046132 as a percentage of the coverage of CP046131.
all_final_copyNUMBER$copyNUMBER <- with(
  all_final_copyNUMBER,
  100 * (CP046132 / CP046131)
)

## Dot plot per sample; the y axis is drawn on a log2-transformed coordinate.
plasmidCopyNUMber <- ggplot(all_final_copyNUMBER, aes(x = sample, y = copyNUMBER)) +
  geom_point() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  coord_trans(y = "log2") +
  labs(x = "", y = "plasmid copy number\n[%]")

print(plasmidCopyNUMber)
svg("../03_results//plasmidCopyNUMBER.svg", width = 4.5, height = 3)
print(plasmidCopyNUMber)
dev.off()

## Mean and sd across samples (printed once as a table, once as plain numbers).
all_final_copyNUMBER %>%
  dplyr::summarise(sum = mean(copyNUMBER), sd = sd(copyNUMBER))
mean(all_final_copyNUMBER$copyNUMBER)
sd(all_final_copyNUMBER$copyNUMBER)
##----------------copy number of phages
## Wide overview table: median coverage of every chr per sample (printed only).
all_final %>%
  select(c("sample", "chr", "median")) %>%
  spread(chr, median)

## Wide table restricted to CP046134 and the two Streptococcus phages.
all_final_copyNUMBER_phage <- all_final %>%
  select(c("sample", "chr", "median")) %>%
  filter(chr %in% c("CP046134", "Streptococcus_phage_1", "Streptococcus_phage_2")) %>%
  spread(chr, median)

## Summed coverage of both Streptococcus phages, then expressed as a
## percentage of the coverage of CP046134.
all_final_copyNUMBER_phage$Streptococcus_phages <- with(
  all_final_copyNUMBER_phage,
  Streptococcus_phage_2 + Streptococcus_phage_1
)
all_final_copyNUMBER_phage$copyNUMBER <- with(
  all_final_copyNUMBER_phage,
  100 * (Streptococcus_phages / CP046134)
)

## Dot plot per sample; the y axis is drawn on a log2-transformed coordinate.
phageCopyNUMber <- ggplot(all_final_copyNUMBER_phage, aes(x = sample, y = copyNUMBER)) +
  geom_point() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  coord_trans(y = "log2") +
  labs(x = "", y = "Sterm phage copy number\n[%]")

print(phageCopyNUMber)
svg("../03_results//phageCopyNUMBER.svg", width = 4.5, height = 3)
print(phageCopyNUMber)
dev.off()

## Summary statistics; the /100 variants convert the percentage to a ratio.
all_final_copyNUMBER_phage %>%
  dplyr::summarise(sum = mean(copyNUMBER), sd = sd(copyNUMBER))
mean(all_final_copyNUMBER_phage$copyNUMBER)
sd(all_final_copyNUMBER_phage$copyNUMBER)
min(all_final_copyNUMBER_phage$copyNUMBER / 100)
max(all_final_copyNUMBER_phage$copyNUMBER / 100)
##----------------copy number of lacto phages
## Wide overview table: median coverage of every chr per sample (printed only).
all_final %>%
  select(c("sample", "chr", "median")) %>%
  spread(., chr, median)

## Wide table restricted to CP046131 and Lactobacillus_phage_1; the
## "working\nstock" sample is excluded.
all_final_copyNUMBER_phage_LACTO <- all_final %>%
  select(c("sample", "chr", "median")) %>%
  filter(chr %in% c("CP046131", "Lactobacillus_phage_1")) %>%
  spread(., chr, median) %>%
  filter(sample != "working\nstock")

## Coverage of Lactobacillus_phage_1 as a percentage of the coverage of
## CP046131. The factor 100 was missing here although the axis label says
## "[%]" and the statistics below divide by 100 to get back to a ratio; it is
## added so the value matches the plasmid and Sterm-phage copy numbers.
all_final_copyNUMBER_phage_LACTO$copyNUMBER <- 100 * (
  all_final_copyNUMBER_phage_LACTO$Lactobacillus_phage_1 /
    all_final_copyNUMBER_phage_LACTO$CP046131
)

## Dot plot per sample; the y axis is drawn on a log2-transformed coordinate.
## The axis label previously read "Sterm phage ..." (copied from the
## Streptococcus plot) although the data are for the Lactobacillus phage.
phage_lactoCopyNUMber <- ggplot(
  all_final_copyNUMBER_phage_LACTO,
  aes(x = sample, y = copyNUMBER)
) +
  geom_point() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  coord_trans(y = "log2") +
  labs(x = "", y = "Ldel phage copy number\n[%]")
phage_lactoCopyNUMber

svg("../03_results//phageCopyNUMBER_lacto.svg", width = 4.5, height = 3)
phage_lactoCopyNUMber
dev.off()

## Summary statistics in percent ...
all_final_copyNUMBER_phage_LACTO %>%
  dplyr::summarize(sum = mean(copyNUMBER), sd = sd(copyNUMBER))
mean(all_final_copyNUMBER_phage_LACTO$copyNUMBER)
sd(all_final_copyNUMBER_phage_LACTO$copyNUMBER)
## ... and as a plain ratio (percent / 100).
min(all_final_copyNUMBER_phage_LACTO$copyNUMBER / 100)
max(all_final_copyNUMBER_phage_LACTO$copyNUMBER / 100)
mean(all_final_copyNUMBER_phage_LACTO$copyNUMBER / 100)
sd(all_final_copyNUMBER_phage_LACTO$copyNUMBER / 100)
##----------------------normalise with actual mapping percent--------------------------------
# library(readr)
#
# mappings <- read_delim("~/Desktop/Projects/2019_Pilotplan/02_mapping2referenceDB/mappings_final.txt", "\t", escape_double = FALSE, col_names = c("sample","type","mapping"), trim_ws = TRUE) %>% filter(type=="onlyMeta") %>% select(add=-type)
# mappings$sample <- revalue(mappings$sample, c("L71"="mst4","lyo202_96"="Lyo 1996","Lyo_202_2012"="Lyo\n2012", "Lyo_202_2014"="Lyo\n2014", "Konserve_202"="working\nstock", "RMK202"="starter\nculture\n2012", "Versand_202"="starter\nculture\n2018","di_K2_6h"="cheesemaking\nday1","th_K2_8h"="cheesemaking\nday2"))
# table(all_final$sample)
# table(mappings$sample)
#
# all_final_normalised <- merge(all_final,mappings,by.x ="sample",by.y="sample" )
# table(all_final_normalised$sample)
#
# revalue(all_final_normalised$chr,c()
#
# table(all_final_normalised$sample)
# all_final_normalised$percent_coverage_normalised <- all_final_normalised$percent_coverage*((all_final_normalised$mapping/100))
# PrelAbundance <- ggplot( data = all_final_normalised,aes(y = percent_coverage_normalised, x = sample, group=interaction(chr),fill = chr))+ geom_bar( stat="identity")+
# labs("",
# x="",
# y="relative abundance")+
# theme_classic()+
# geom_hline(yintercept=100, linetype="dashed", color = "grey")+
# # scale_color_viridis(discrete=TRUE)+
# scale_fill_manual(values=all_colours_new)+
# # scale_fill_viridis(discrete=TRUE)+
# theme(axis.text.x = element_text(angle = 75, hjust = 1,size=9),
# # axis.text.x = element_blank(),
# legend.position="right",
# #legend.justification=c(1,1), legend.position=c(1,1),
# legend.title = element_blank()
# )
#
# PrelAbundance
# svg("~/Desktop/mid_thesis/report/figures/20200430/chapter1/relative_abundance.svg",width=4.5,height=3)
# # png("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.png", width = 1000, height = 800,res=300)
#
# PrelAbundance
#
# dev.off()
# #
##===============================
##only Bacteria
##===============================
all_final_bacteria

## Recompute percent_coverage within the bacteria-only table: each replicon's
## median coverage as a percentage of the summed median coverage of its sample.
total_samples_sumTreatment <- aggregate(
  . ~ sample,
  data = all_final_bacteria[, c("sample", "median")],
  sum,
  na.rm = TRUE
)
all_final_bacteria$total_coverage <- total_samples_sumTreatment[
  match(all_final_bacteria$sample, total_samples_sumTreatment$sample),
  "median"
]
all_final_bacteria$percent_coverage <-
  100 * (all_final_bacteria$median / all_final_bacteria$total_coverage)

## Two fill colours for the bacteria-only plot. This overwrites the palette
## variable used for the full relative-abundance plot.
all_colours_new <- c("#10B552", "#EB4D4D")

## Stacked bar chart: one bar per sample, one segment per chr.
PrelAbundance_bacteria <- ggplot(
  data = all_final_bacteria,
  aes(y = percent_coverage, x = sample, group = interaction(chr), fill = chr)
) +
  geom_col() +
  labs("",
    x = "",
    y = "relative abundance"
  ) +
  theme_classic() +
  scale_fill_manual(values = all_colours_new) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
    legend.position = "right",
    legend.title = element_blank()
  )

library(patchwork)
print(PrelAbundance_bacteria)

## Combined figure: full plot and bacteria-only plot without legends, followed
## by the full plot once more with its legend.
svg("../03_results//relative_abundance_all.svg", width = 10, height = 4.5)
print(
  (PrelAbundance + theme(legend.position = "none")) +
    (PrelAbundance_bacteria + theme(legend.position = "none")) +
    PrelAbundance
)
dev.off()
###----------------------low copy plasmids
## Median gene coverage per sample and replicon, restricted to the five
## CP0461xx replicons.
all_final_allsamll <- read_count %>%
  group_by(sample, chr) %>%
  dplyr::summarize(median = median(geneCoverage)) %>%
  filter(chr %in% c("CP046131", "CP046134", "CP046132", "CP046133", "CP046135"))

## Wide table (one column per replicon); "working\nstock" is excluded.
all_final_copyNUMBER_phage_low <- all_final_allsamll %>%
  select(c("sample", "chr", "median")) %>%
  spread(., chr, median) %>%
  filter(sample != "working\nstock")

## Coverage ratios (plain ratios, not percent).
all_final_copyNUMBER_phage_low$copyNUMBER_2 <-
  (all_final_copyNUMBER_phage_low$CP046133 / all_final_copyNUMBER_phage_low$CP046131)
all_final_copyNUMBER_phage_low$copyNUMBER_3 <-
  (all_final_copyNUMBER_phage_low$CP046135 / all_final_copyNUMBER_phage_low$CP046134)
## CP046132 is normalised by CP046131, matching the plasmid copy-number
## calculation elsewhere in this file (100 * CP046132 / CP046131). It was
## previously divided by CP046134, the accession of the other species' genome.
all_final_copyNUMBER_phage_low$copyNUMBER_1 <-
  (all_final_copyNUMBER_phage_low$CP046132 / all_final_copyNUMBER_phage_low$CP046131)

## NOTE(review): the filter above compares sample against a display label
## ("working\nstock"); if the samples in read_count are already relabelled,
## "RMK202" will match no row here - confirm which naming read_count uses.
all_final_copyNUMBER_phage_low %>% filter(sample == "RMK202")

# phage_lactoCopyNUMber <- ggplot(all_final_copyNUMBER_phage_LACTO,aes(x=sample,y=copyNUMBER))+geom_point()+theme_classic()+theme(axis.text.x = element_text(angle = 75, hjust = 1,size=9))+coord_trans( y="log2")+labs(x="",y="Sterm phage copy number\n[%]")
#
# phage_lactoCopyNUMber
# svg("~/Desktop/mid_thesis/report/figures/20200430/chapter1/20201006/supplement/phageCopyNUMBER_lacto.svg",width=4.5,height=3)
# png("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.png", width = 1000, height = 800,res=300)
# phage_lactoCopyNUMber
# dev.off()

## Mean and sd of each ratio across samples.
mean(all_final_copyNUMBER_phage_low$copyNUMBER_2)
mean(all_final_copyNUMBER_phage_low$copyNUMBER_3)
sd(all_final_copyNUMBER_phage_low$copyNUMBER_2)
sd(all_final_copyNUMBER_phage_low$copyNUMBER_3)
mean(all_final_copyNUMBER_phage_low$copyNUMBER_1)
sd(all_final_copyNUMBER_phage_low$copyNUMBER_1)

## Repeated summary of the Lactobacillus phage copy number.
## NOTE(review): the /100 lines assume copyNUMBER is expressed in percent -
## confirm against the place where copyNUMBER is computed.
all_final_copyNUMBER_phage_LACTO %>%
  dplyr::summarize(sum = mean(copyNUMBER), sd = sd(copyNUMBER))
mean(all_final_copyNUMBER_phage_LACTO$copyNUMBER)
sd(all_final_copyNUMBER_phage_LACTO$copyNUMBER)
min(all_final_copyNUMBER_phage_LACTO$copyNUMBER / 100)
max(all_final_copyNUMBER_phage_LACTO$copyNUMBER / 100)
mean(all_final_copyNUMBER_phage_LACTO$copyNUMBER / 100)
sd(all_final_copyNUMBER_phage_LACTO$copyNUMBER/100)
1.6.3 Identify active Prophage reads
Here, I try to identify reads that originate from active prophages. To do this, I look for paired-end reads whose mates map to different contigs. The relevant columns of the SAM file are the following:
$7 RNEXT String \*|=|[:rname:^*=][:rname:]* Reference name of the mate/next read
$8 PNEXT Int [0, 2^31 − 1] Position of the mate/next read
RNEXT: Reference sequence name of the primary alignment of the NEXT read in the template. For the last read, the next read is the first read in the template. If @SQ header lines are present, RNEXT (if not ‘*’ or ‘=’) must be present in one of the SQ-SN tags. This field is set as ‘*’ when the information is unavailable, and set as ‘=’ if RNEXT is identical to RNAME. If not ‘=’ and the next read in the template has one primary mapping (see also bit 0x100 in FLAG), this field is identical to RNAME at the primary line of the next read. If RNEXT is ‘*’, no assumptions can be made on PNEXT and bit 0x20.
PNEXT: 1-based Position of the primary alignment of the NEXT read in the template. Set as 0 when the information is unavailable. This field equals POS at the primary line of the next read. If PNEXT is 0, no assumptions can be made on RNEXT and bit 0x20.
Obviously, I need to first extract all reads that map to the phages.
First, I mask the CRISPR arrays and append the genomes of the lineages to the MAGs.
## mask genomes
## For each listed genome: write the PROKKA GFF lines containing "repeat" as
## BED intervals (columns 1,4,5), mask those intervals in a FASTA file and
## append the masked FASTA to one combined file.
species=Sterm
prokka_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA
masked_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes/strains/${species}/renamedContigs/StartAligned/StartAligned/
## BaseLocation is only assigned further down in this file; stop here instead
## of silently reading/writing below "/" when it is still unset.
: "${BaseLocation:?BaseLocation must be set before masking the genomes}"
all_masked=${BaseLocation}/CRISPRspacerBLAST/blast/all_Genomes_masked.fna
## -f: do not complain when no combined file exists yet.
rm -f ${all_masked}
for genomes in 202-SMAG 202-S50 202-S72
do
echo $genomes
bed=${prokka_dir}/${species}/${genomes}/CRISPRrepeat_${genomes}.bed
## NOTE(review): if PROKKA_*.gff matches more than one file, grep prefixes each
## hit with "filename:", which would corrupt BED column 1 - confirm one match.
grep "repeat" ${prokka_dir}/${species}/${genomes}/PROKKA_*.gff |cut -f 1,4,5 > ${bed}
## NOTE(review): the input FASTA is fixed to S_O_202_13496.fna for every
## genome in the loop, while the BED file changes per genome - confirm this is
## intended and not a leftover from a single-genome run.
bedtools maskfasta -fi ${prokka_dir}/Sterm/FNA_all/S_O_202_13496.fna -bed ${bed} -fo ${masked_dir}/${genomes}_CRISPRmasked.fna
cat ${masked_dir}/${genomes}_CRISPRmasked.fna >> ${all_masked}
done
##change name
## Build one multi-FASTA with one genome per lineage. Each sed call renames a
## contig identifier to S_genome_lineage<N>; the first call creates the output
## file (">"), the remaining calls append to it (">>").
masked_genomes=/work/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_one_of_each_line/CRISPR_masked_genomes/
lineage_fasta=${masked_genomes}/For_all_lineages_one_genome.fasta
mag_assembly=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/maskedReferenceMapping/CRISPRmasked_RMK202_MAG_assembly.fasta

## lineage 1: taken from the CRISPR-masked MAG assembly
sed 's/CP046134/S_genome_lineage1/g' ${mag_assembly} > ${lineage_fasta}
## lineage 4
sed 's/contig_1/S_genome_lineage4/g' ${masked_genomes}/S_O_202_13496*.fna >> ${lineage_fasta}
## lineage 3
sed 's/S_O_202_24740_c1/S_genome_lineage3/g' ${masked_genomes}/S_O_202_24740*.fna >> ${lineage_fasta}
## lineage 2
sed 's/contig_1/S_genome_lineage2/g' ${masked_genomes}/S_O_202_13494*.fna >> ${lineage_fasta}

## List the sequence headers of the combined file as a check.
grep ">" ${lineage_fasta}
###================
##description file
###================
## Settings used by the mapping steps below.
## Sample list; the first tab-separated column is used as the sample name.
#samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt ##the file with all sample names
samplesss="/archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples_withEvolution.txt"
## Thread count passed to bwa mem and samtools sort.
threads="37"
## Log directory (created before mapping).
logFilelocation="/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/maskedReferenceMapping//01_log"
## Per-sample mapping output goes below this directory.
BaseLocation="/work/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_one_of_each_line/maskedReferenceMapping/"
## Reference FASTA that the reads are mapped against.
Assembly="/work/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_one_of_each_line/CRISPR_masked_genomes//For_all_lineages_one_genome.fasta"
## Default sample name.
names="G4_6_18"
##--------------------------------------------------------------------------------------------------------------------------------------
##-----------make complete reference with CRISPR masked genomes
##--------------------------------------------------------------------------------------------------------------------------------------
## Start from an empty output directory, then assemble the reference from:
##   1. the CRISPR-masked S MAG   (202-SMAG-1 renamed to CP046134)
##   2. the CRISPR-masked L MAG   (202-LMAG-1 renamed to CP046131)
##   3. the two start-aligned Streptococcus phage sequences
##   4. every remaining sequence of the original MAG assembly
masked_ref_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/maskedReferenceMapping/
masked_ref=${masked_ref_dir}CRISPRmasked_RMK202_MAG_assembly.fasta
phage_fasta_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/startaligned
mag_assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/RMK202_MAG_assembly.fasta

rm -r ${masked_ref_dir}
mkdir -p ${masked_ref_dir}

sed 's/202-SMAG-1/CP046134/g' /archiv/Projects/2019_RMK202_analysis/00_FINAL/02_CRISPR/CRISPR_region_tree/Sterm/maskedGenomes/202-SMAG_CRISPRmasked_extended.fna > ${masked_ref}
sed 's/202-LMAG-1/CP046131/g' /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes/strains/Ldel/renamedContigs/StartAligned/StartAligned//202-LMAG_CRISPRmasked_extended.fna >> ${masked_ref}
cat ${phage_fasta_dir}/Streptococcus_phage_1_startAligned_Final.fasta >> ${masked_ref}
cat ${phage_fasta_dir}/Streptococcus_phage_2_startAligned_Final.fasta >> ${masked_ref}

## Append every sequence of the original assembly whose header does not
## contain CP046134, CP046131 or Streptococcus_phage_ (those were added above).
for remainsss in $(grep ">" ${mag_assembly} | sed 's/>//g' | grep -v -e "CP046134" -e "CP046131" -e "Streptococcus_phage_")
do
samtools faidx ${mag_assembly} ${remainsss} >> ${masked_ref}
done
###================
##description file
###================
## Run PilerCR on the reference assembly; the report is written to
## ${Assembly}_pilarTest and the sequences to ${Assembly}_pilarTest.fasta.
## A space is required before each line-continuation backslash: with
## "-noinfo\" directly followed by a line starting with "-seq", the shell
## joins the two into the single word "-noinfo-seq".
~/apps/PilerCR/pilercr1.06/pilercr -in ${Assembly} \
-out ${Assembly}_pilarTest -noinfo \
-seq ${Assembly}_pilarTest.fasta
##============
##mapping to reference
##============
## Map the paired-end reads of every sample against ${Assembly}, keep the
## sorted BAM, and extract all alignments whose RNEXT field is not "=" (mate
## not on the same reference; this also keeps records with RNEXT "*").
##
## Changes compared with the previous version of this loop:
##  * "names=G4_6_18" used to be assigned inside the loop body, which
##    overwrote the loop variable: every iteration re-mapped G4_6_18 and only
##    its reads were extracted. G4_6_18 now only selects a different pair of
##    input FASTQ files.
##    NOTE(review): G4_6_18 is therefore only processed if it is listed in
##    ${samplesss} - confirm that it is.
##  * the BAM header was appended once per iteration, giving a repeated header
##    in the merged SAM; it is now written exactly once.
##  * the progress counter shows the real number of samples instead of "/16".
bwa index $Assembly
# name_folder=02_againstpolished_single_rmk202_final_kneaddata_withStrains
mkdir -p ${logFilelocation}
phage_out=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping
mkdir -p ${phage_out}
total=$(cut -f 1 ${samplesss} | wc -l | tr -d ' ')
num=1
header_done=0
for names in $(cut -f 1 ${samplesss})
do
echo "${num}/${total} : ${names}"
num=$((num+1))
## ":?" aborts instead of expanding to "/" when a variable is empty.
rm -rf "${BaseLocation:?}/${names:?}"
mkdir -p ${BaseLocation}/${names}/bwaMapping2DB/
##-------------special for G4: its reads live in a different directory
if [ "${names}" = "G4_6_18" ]; then
reads_dir=/home/vincent/Projects/2020_StarterEvolution/01_data/20200929_Novogene/X204SC20090774-Z01-F001/trimm_Galore/gz/${names}
reads_1=${reads_dir}/${names}-R1_val_1.fq
reads_2=${reads_dir}/${names}-R2_val_2.fq
else
reads_dir=/archiv/Projects/2019_pilotplant/01_rawData/02_mergedfastq/kneaddata/${names}/${names}
## the "*" is expanded when the variable is used unquoted below
reads_1=${reads_dir}/${names}*neaddata_paired_1.fastq
reads_2=${reads_dir}/${names}*neaddata_paired_2.fastq
fi
bam=${BaseLocation}/${names}/bwaMapping2DB/${names}_rawIllumina_bwamem_sorted.bam
bwa mem -t ${threads} ${Assembly} \
${reads_1} ${reads_2} | samtools sort -@${threads} -O BAM -o ${bam} -
## alignments whose mate is not reported on the same reference ($7 != "=")
samtools view ${bam} |awk -F "\t" '{OFS="\t"}{if($7!="=")print $0}' > ${phage_out}/${names}_reads_2_mappings.sam
## all samples are mapped to the same reference, so one header is enough
if [ ${header_done} -eq 0 ]; then
samtools view -H ${bam} > ${phage_out}/all_reads_2_mappings.sam
header_done=1
fi
#rm ${BaseLocation}/${names}/bwaMapping2DB/${names}_rawIllumina_bwamem_sorted.bam &
done
## Header first, then the extracted reads of all samples; sort and convert.
mv ${phage_out}/all_reads_2_mappings.sam ${phage_out}/all_reads_2_mappings_all.sam
cat ${phage_out}/*_reads_2_mappings.sam >> ${phage_out}/all_reads_2_mappings_all.sam
samtools sort ${phage_out}/all_reads_2_mappings_all.sam |samtools view -S -b - > ${phage_out}/all_reads_2_mappings_all.bam
##--------------------------------------------------------------------------------------------------------------------------------------
##-----------extract reads that map to two different contigs
##--------------------------------------------------------------------------------------------------------------------------------------
## Per sample, reduce the extracted alignments to candidate junction reads and
## print the number of lines before and after filtering.
#mkdir -p /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/
#for names in $(cat /archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt)
phage_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping
for names in $(cut -f 1 ${samplesss})
do
echo "----------------------"
echo ${names}
raw_sam=${phage_dir}/${names}_reads_2_mappings.sam
polished_sam=${phage_dir}/${names}_reads_2_mappings_polished.sam
## number of lines mentioning "phage" before filtering
grep -c "phage" ${raw_sam}
## Filters, all of which must hold:
##   - the line contains "phage"
##   - column 12 is exactly NM:i:0 or NM:i:1
##   - column 6 (CIGAR) contains S or H (soft/hard clipping)
##   - column 14 contains S or H
##     NOTE(review): this presumably expects column 14 to be the mate CIGAR
##     tag (MC:Z); if it were the AS:i tag it would always match - confirm.
##   - the line does not match H<n>M<n>H or S<n>M<n>S (clipped on both ends
##     --> potential CRISPR array)
grep "phage" ${raw_sam} \
 | awk -F "\t" '($12=="NM:i:0" || $12=="NM:i:1") && $6 ~ /[SH]/ && $14 ~ /[SH]/' \
 | grep -v -e "H[0-9]\{1,3\}M[0-9]\{1,3\}H" -e "S[0-9]\{1,3\}M[0-9]\{1,3\}S" \
 > ${polished_sam}
## number of lines kept
wc -l < ${polished_sam}
done
##--------------------------------------------------------------------------------------------------------------------------------------
##-----------extract reads that map to two different contigs
##--------------------------------------------------------------------------------------------------------------------------------------
## Per sample, count the polished lines for each phage / genome combination.
## A line is counted when it contains both search strings anywhere in it.
## The S. thermophilus genomes are matched with "S_gen" in both places (the
## second one used to be the looser regular expression "S.gen").
phage_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping
mkdir -p ${phage_dir}/
#for names in $(cat /archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt)
for names in $(cut -f 1 ${samplesss} )
do
polished_sam=${phage_dir}/${names}_reads_2_mappings_polished.sam
echo ${names}
echo "-----------------"
echo "number of phage reads:"
grep -c "phage" ${polished_sam}
echo "S. phage and lactobacillus genome reads:"
grep "Streptococcus_phage" ${polished_sam} |grep -c "CP046131"
echo "S. phage and Streptococcus genome reads:"
#grep "Streptococcus_phage" ${polished_sam} |grep -c "CP046134"
grep "Streptococcus_phage" ${polished_sam} |grep -c "S_gen"
echo "L. phage and lactobacillus genome reads:"
grep "Lactobacillus_phage" ${polished_sam} |grep -c "CP046131"
echo "L. phageand Streptococcus genome reads:"
#grep "Lactobacillus_phage" ${polished_sam} |grep -c "CP046134"
grep "Lactobacillus_phage" ${polished_sam} |grep -c "S_gen"
#grep "phage" ${phage_dir}/${names}_reads_2_mappings.sam |cut -f 7 |sort |uniq -c
#grep "phage" ${phage_dir}/${names}_reads_2_mappings.sam |cut -f 3 |sort |uniq -c
echo "=================================================================="
done
##--------------------------------------------------------------------------------------------------------------------------------------
##-----------create circos file for mapping
##--------------------------------------------------------------------------------------------------------------------------------------
# Builds, across all samples, one link table (allGenomes_mapping_2_genomes.txt)
# and one SAM subset (allGenomes_mapping_2_genomes.sam) containing the alignment
# records that involve a phage contig, i.e. "_phage_" in SAM column 3 (RNAME)
# or column 7 (RNEXT).
# Link table columns: phage contig, phage start, phage start + read length,
# other contig, other start, other start + read length, sample, MAPQ.
# The interval end is start + length(SEQ); the CIGAR string is not evaluated.
mapdir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping
linkdir="${mapdir}/mappingLocation"
#awk -F "\t" -v samplesss="$names" '{OFS="\t"}{if($5>1 && $5<7) print $0}' ${mapdir}/${names}_reads_2_mappings_polished.sam
# Start from an empty output directory because everything below appends (>>).
# -f keeps the first run quiet when the directory does not exist yet.
rm -rf "${linkdir}"
mkdir -p "${linkdir}"
#for names in $(cat /archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt)
for names in $(cut -f 1 "${samplesss}")
do
  sam="${mapdir}/${names}_reads_2_mappings_polished.sam"
  echo "${names}"
  echo "-----------------"
  ## earlier variant without the MAPQ filter: the same two link-table awk calls
  ## without `&& $5>15` and without the trailing $5 output column
  ##-------------------no multimapping
  ##I think mapq>15 is the cutoff for only single mappings
  # read itself sits on the phage contig, its mate on the other contig
  awk -F "\t" -v sample="$names" 'BEGIN{OFS="\t"}{if($3~"_phage_" && $5>15) print $3,$4,$4+length($10),$7,$8,$8+length($10),sample,$5}' "${sam}" >> "${linkdir}/allGenomes_mapping_2_genomes.txt"
  # mate sits on the phage contig; columns are swapped so the phage side comes first
  awk -F "\t" -v sample="$names" 'BEGIN{OFS="\t"}{if($7~"_phage_" && $5>15) print $7,$8,$8+length($10),$3,$4,$4+length($10),sample,$5}' "${sam}" >> "${linkdir}/allGenomes_mapping_2_genomes.txt"
  # same two selections, but keeping the untouched SAM records
  awk -F "\t" '{if($3~"_phage_" && $5>15) print $0}' "${sam}" >> "${linkdir}/allGenomes_mapping_2_genomes.sam"
  awk -F "\t" '{if($7~"_phage_" && $5>15) print $0}' "${sam}" >> "${linkdir}/allGenomes_mapping_2_genomes.sam"
  echo "=================================================================="
done
wc -l "${linkdir}/allGenomes_mapping_2_genomes.txt"
# cleaned table: the second contig must not be a phage and the sample column must be filled
awk -F "\t" 'BEGIN{OFS="\t"}{if($4!~"_phage_" && $7!="") print $0}' "${linkdir}/allGenomes_mapping_2_genomes.txt" > "${linkdir}/allGenomes_mapping_2_genomes_cleaned.txt"
wc -l "${linkdir}/allGenomes_mapping_2_genomes_cleaned.txt"
# sanity checks: rows per sample (before/after cleaning) and rows per second contig
cut -f 7 "${linkdir}/allGenomes_mapping_2_genomes.txt" |sort|uniq -c
cut -f 7 "${linkdir}/allGenomes_mapping_2_genomes_cleaned.txt" |sort|uniq -c
cut -f 4 "${linkdir}/allGenomes_mapping_2_genomes_cleaned.txt" |sort|uniq -c
grep "S_gen" "${linkdir}/allGenomes_mapping_2_genomes.sam" > "${linkdir}/allGenomes_mapping_2_genomes_Sterm.sam"
###--------------------------make bedfile for coverage
# For every sample, write the intervals of phage-mapped reads and of their
# mates (contig, start, start + read length, sample) to one BED-like file.
mapdir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping
for names in $(cut -f 1 ${samplesss} )
do
  sam=${mapdir}/${names}_reads_2_mappings_polished.sam
  echo ${names}
  echo "-----------------"
  # two output lines per record: the read interval, then the mate interval
  awk -F "\t" -v sample="$names" 'BEGIN{OFS="\t"}{if($3~"_phage_"){print $3,$4,$4+length($10),sample; print $7,$8,$8+length($10),sample}}' ${sam} >> ${mapdir}/mappingLocation/allGenomes_mapping_2_genomes.bed
  # NOTE(review): this call appends 7-column rows without a MAPQ filter to the
  # .txt link table, not to the .bed file -- confirm that this is intended.
  awk -F "\t" -v sample="$names" 'BEGIN{OFS="\t"}{if($7~"_phage_")print $7,$8,$8+length($10),$3,$4,$4+length($10),sample}' ${sam} >> ${mapdir}/mappingLocation/allGenomes_mapping_2_genomes.txt
  echo "=================================================================="
done
# Per-sample, per-base coverage of the phage-linked intervals.
# genome.bed (contig name and length from seq_length.py) is the genome file
# handed to `bedtools genomecov -g`.
mapdir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping
linkdir="${mapdir}/mappingLocation"
cleaned="${linkdir}/allGenomes_mapping_2_genomes_cleaned.txt"
seq_length.py "${Assembly}" |cut -f 1,3 |sort > "${linkdir}/genome.bed"
# The coverage files are appended to below, so remove old ones first.
# -f: no error when the glob matches nothing (e.g. on a fresh directory).
rm -f "${linkdir}"/*llGenomes_mapping_2_genomes_cleaned_*_coverage.bed
for names in $(cut -f 1 "${samplesss}")
do
  echo "${names}"
  echo "-----------------"
  # Intermediate BED files are overwritten for every sample; each link row
  # yields two intervals (phage side, then the side on the second contig).
  awk -F "\t" -v samplezzz="$names" 'BEGIN{OFS="\t"}{if($1~"Streptococcus_phage" && $7==samplezzz){print $1,$2,$3,$7; print $4,$5,$6,$7}}' "${cleaned}" |bedtools sort > "${linkdir}/allGenomes_mapping_2_genomes_cleaned_StermPhages.bed"
  # same selection, but every Streptococcus phage is relabelled "Streptococcus_phage_1"
  awk -F "\t" -v samplezzz="$names" 'BEGIN{OFS="\t"}{if($1~"Streptococcus_phage" && $7==samplezzz){print "Streptococcus_phage_1",$2,$3,$7; print $4,$5,$6,$7}}' "${cleaned}" |bedtools sort > "${linkdir}/allGenomes_mapping_2_genomes_cleaned_StermPhages_bothtoegther.bed"
  awk -F "\t" -v samplezzz="$names" 'BEGIN{OFS="\t"}{if($1=="Lactobacillus_phage_1" && $7==samplezzz){print $1,$2,$3,$7; print $4,$5,$6,$7}}' "${cleaned}" |bedtools sort > "${linkdir}/allGenomes_mapping_2_genomes_cleaned_LdelPhages.bed"
  ##----coverage
  # -d reports every position; positions with zero depth are dropped and the
  # sample name is added as an extra column
  bedtools genomecov -i "${linkdir}/allGenomes_mapping_2_genomes_cleaned_StermPhages.bed" -d -g "${linkdir}/genome.bed" |awk -v samplezzz="$names" -F "\t" 'BEGIN{OFS="\t"}{if($3!=0) print $0,samplezzz}' >> "${linkdir}/allGenomes_mapping_2_genomes_cleaned_StermPhages_coverage.bed"
  bedtools genomecov -i "${linkdir}/allGenomes_mapping_2_genomes_cleaned_StermPhages_bothtoegther.bed" -d -g "${linkdir}/genome.bed" |awk -v samplezzz="$names" -F "\t" 'BEGIN{OFS="\t"}{if($3!=0) print $0,samplezzz}' >> "${linkdir}/allGenomes_mapping_2_genomes_cleaned_StermPhages_bothtoegther_coverage.bed"
  bedtools genomecov -i "${linkdir}/allGenomes_mapping_2_genomes_cleaned_LdelPhages.bed" -d -g "${linkdir}/genome.bed" |awk -v samplezzz="$names" -F "\t" 'BEGIN{OFS="\t"}{if($3!=0) print $0,samplezzz}' >> "${linkdir}/allGenomes_mapping_2_genomes_cleaned_LdelPhages_coverage.bed"
  echo "=================================================================="
done
# ---- the shell part ends with the `done` above; R code starts here ----
library(readr)
library(tidyverse)
# Cleaned phage link table: phage-side coordinates, coordinates on the second
# contig, sample of origin and mapping quality (the file has no header line).
allGenomes_mapping_2_genomes <- read_delim(
  "../data_zenodo/non_genomic_data//allGenomes_mapping_2_genomes_cleaned.txt",
  "\t",
  escape_double = FALSE,
  col_names = c(
    "phageGenome", "phageStart", "phageEnd",
    "SecondGenome", "SecondStart", "SecondEnd",
    "sample", "MAPQ"
  ),
  trim_ws = TRUE
)
# Exploratory views of where the links fall on the second contig / on the phage
ggplot(
  allGenomes_mapping_2_genomes,
  aes(x = SecondStart, fill = phageGenome, color = phageGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_wrap(~SecondGenome + sample, scales = "free")
ggplot(
  allGenomes_mapping_2_genomes,
  aes(x = phageStart, fill = SecondGenome, color = SecondGenome)
) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  facet_wrap(~phageGenome, scales = "free")
allGenomes_mapping_2_genomes$SecondStart
ggplot(
  allGenomes_mapping_2_genomes,
  aes(x = SecondStart, fill = phageGenome, color = phageGenome)
) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  facet_wrap(~SecondGenome + phageGenome, scales = "free")
ggplot(
  allGenomes_mapping_2_genomes,
  aes(x = SecondStart, fill = phageGenome, color = phageGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_grid(vars(SecondGenome), vars(phageGenome), scales = "free")
ggplot(
  allGenomes_mapping_2_genomes,
  aes(x = phageStart, fill = phageGenome, color = phageGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_grid(vars(SecondGenome), vars(phageGenome), scales = "free")
##------------------------
### only streptococci
##------------------------
table(allGenomes_mapping_2_genomes$SecondGenome)
# Keep the links of the two Streptococcus phages and drop those whose second
# contig is CP046131 or CP046132.
# Further filters that were tried: SecondGenome == "CP046134",
# phageEnd - phageStart > 100, SecondEnd - SecondStart > 100
allGenomes_mapping_2_genomes_sterm <- allGenomes_mapping_2_genomes %>%
  filter(
    phageGenome %in% c("Streptococcus_phage_1", "Streptococcus_phage_2"),
    SecondGenome != "CP046131",
    SecondGenome != "CP046132"
  )
table(allGenomes_mapping_2_genomes_sterm$SecondGenome)
# length of the phage-side interval
allGenomes_mapping_2_genomes_sterm <- allGenomes_mapping_2_genomes_sterm %>%
  mutate(phageDist = phageEnd - phageStart)
ggplot(allGenomes_mapping_2_genomes_sterm, aes(x = phageDist)) +
  geom_histogram() +
  theme_classic()
# ggplot(allGenomes_mapping_2_genomes_sterm,aes(x=phageStart,fill=phageGenome,color=phageGenome))+geom_histogram(alpha=0.5)+theme_classic()+facet_grid(vars(SecondGenome),vars(phageGenome),scales = "free")
ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(x = SecondStart, fill = phageGenome, color = phageGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_wrap(~SecondGenome + sample, scales = "free_y")
ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(x = SecondStart, fill = phageGenome, color = phageGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_grid(vars(SecondGenome), vars(phageGenome), scales = "free")
## maybe different insertion location per phage and per lineage
ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(
    x = SecondStart,
    fill = interaction(phageGenome, SecondGenome),
    color = interaction(phageGenome, SecondGenome)
  )
) +
  geom_histogram(alpha = 0.5) +
  theme_classic()
ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(x = SecondStart, fill = phageGenome, color = SecondGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic()
## maybe some samples are weird
ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(x = SecondStart, fill = phageGenome, color = SecondGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_wrap(~sample, scales = "free_y")
# Same table without the samples RMK202, Konserve_202 and Versand_202
allGenomes_mapping_3_genomes_sterm <- allGenomes_mapping_2_genomes_sterm %>%
  filter(!sample %in% c("RMK202", "Konserve_202", "Versand_202"))
p2 <- ggplot(
  allGenomes_mapping_3_genomes_sterm,
  aes(x = SecondStart, fill = SecondGenome, color = SecondGenome)
) +
  geom_histogram(alpha = 0.5, bins = 100) +
  theme_classic() +
  lims(x = c(0, 2000000)) +
  labs(x = "genomic location") +
  theme(legend.title = element_blank())
p2
# write the histogram to disk
png(
  "../03_results//mapping_location_phages_unique.png",
  width = 1900, height = 1200, res = 300
)
p2
dev.off()
# p2 is redefined here: fill now encodes the phage, the outline the second contig
p2 <- ggplot(
  allGenomes_mapping_3_genomes_sterm,
  aes(x = SecondStart, fill = phageGenome, color = SecondGenome)
) +
  geom_histogram(alpha = 0.5, bins = 100) +
  theme_classic() +
  lims(x = c(0, 2000000))
library(patchwork)
# att_plot is not created in this section; it has to exist already
p2 / att_plot
att_plot
library(plotly)
ggp <- ggplotly(p2)
ggplot(
  allGenomes_mapping_3_genomes_sterm,
  aes(x = SecondStart, y = phageStart, fill = phageGenome, color = phageGenome)
) +
  facet_grid(vars(SecondGenome), vars(phageGenome), scales = "free") +
  geom_point(alpha = 0.5) +
  theme_classic()
# 2D density of link positions, one panel per second contig x phage
integrationSite <- ggplot(
  allGenomes_mapping_3_genomes_sterm,
  aes(x = SecondStart, y = phageStart)
) +
  stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  theme_classic() +
  facet_grid(vars(SecondGenome), vars(phageGenome), scales = "free") +
  labs(
    x = "location of mate mapping on S. thermophilus genome",
    y = "location of mate mapping on phage genome"
  ) +
  theme(legend.position = "none")
integrationSite
# Unfaceted version on the table that still contains all samples; this is the
# object that stays assigned to integrationSite.
integrationSite <- ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(x = SecondStart, y = phageStart)
) +
  stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  theme_classic() +
  labs(
    x = "location of phage integration\non S. thermophilus genome",
    y = "location of phage integration\non phage genome"
  ) +
  theme(legend.position = "right") # +geom_hline(yintercept=c(7500,29000,35000))
integrationSite
###=====================================
## number and percent of reads supporting the integration
###=====================================
##---------------------------------new approach with coverage
# Per-base coverage of the phage-linked intervals (zero-depth positions are
# not in the file); one row per contig position and sample, no header line.
allGenomes_mapping_2_genomes_coverage <- read_delim(
  "../data_zenodo/non_genomic_data//allGenomes_mapping_2_genomes_cleaned_StermPhages_bothtoegther_coverage.bed",
  "\t",
  escape_double = FALSE,
  col_names = c("Genome", "position", "coverage", "sample"),
  trim_ws = TRUE
)
# ggplot(allGenomes_mapping_2_genomes_coverage,aes(x=position,y=coverage,color=sample))+geom_bar(stat="identity",alpha=0.5)+theme_classic()+facet_wrap(~Genome+sample,scales = "free")
# Maximum coverage per sample on CP046134, stored as `phageCoverage`.
# Grouping on the two columns directly avoids building a "sample.Genome" label
# and splitting it at the first "." again, which would break as soon as a
# sample name contained a dot.
summary_mapping <- allGenomes_mapping_2_genomes_coverage %>%
  group_by(sample, Genome) %>%
  dplyr::summarise(phageCoverage = max(coverage)) %>%
  ungroup() %>%
  filter(Genome == "CP046134") %>%
  dplyr::select(sample, phageCoverage)
###-------------------------------coverage genome
# Read counts per feature and sample for the bacterial and phage contigs
# (no header line).
read_count <- read_delim(
  "../data_zenodo/non_genomic_data//Coverage_bacteria_and_phages_from_MAG.bed",
  "\t",
  escape_double = FALSE,
  col_names = c(
    "chr", "start", "end", "count",
    "length_mapped", "geneLength", "unknown", "sample"
  ),
  trim_ws = TRUE
)
# read_count$species <- ifelse(read_count$chr=="L_del_phage_01","L_del_phage_01","S_term_phage_01")
table(read_count$chr)
# NOTE(review): 600 presumably stands for the number of bases contributed per
# counted read (pair) -- confirm against the sequencing setup.
read_count$geneCoverage <- (read_count$count * 600) / read_count$geneLength
# ggplot(read_count,aes(y=geneCoverage,group=sort,color=sort,fill=sort))+geom_boxplot()+facet_grid(sample~species, scales="free")
library(dplyr)
# Median feature coverage per sample, one column per contig
# (CP046134, Streptococcus_phage_1, Streptococcus_phage_2)
all_final <- read_count %>%
  group_by(sample, chr) %>%
  dplyr::summarize(BacteriaCoverage = median(geneCoverage)) %>%
  filter(chr %in% c("CP046134", "Streptococcus_phage_1", "Streptococcus_phage_2")) %>%
  spread(., chr, BacteriaCoverage)
# Combine with the integration-supporting coverage; the two cheesemaking
# samples are left out.
final_phage <- merge(all_final, summary_mapping, by = "sample") %>%
  filter(!sample %in% c("di_K2_6h", "th_K2_8h"))
# All percentages are relative to the median coverage of CP046134
final_phage$percent_Prophage <- 100 * (final_phage$phageCoverage / final_phage$CP046134)
final_phage$abundance_phage1 <- 100 * (final_phage$Streptococcus_phage_1 / final_phage$CP046134)
final_phage$abundance_phage2 <- 100 * (final_phage$Streptococcus_phage_2 / final_phage$CP046134)
# Human-readable sample labels in plotting order
final_phage$sample <- plyr::revalue(final_phage$sample, c("lyo202_96"="Lyo 1996","Lyo_202_2012"="Lyo 2012", "Lyo_202_2014"="Lyo 2014", "Konserve_202"="working stock", "RMK202"="starter culture 2012", "Versand_202"="starter culture 2018", "di_K2_6h"="cheesemaking day1","th_K2_8h"="cheesemaking day2","G1_6_18"="experiment_A","G2_6_18"="experiment_B","G3_6_18"="experiment_C","G4_6_18"="experiment_D","G5_6_18"="experiment_E"))
final_phage$sample <- factor(final_phage$sample, levels=c("Lyo 1996","Lyo 2012","Lyo 2014","working stock","starter culture 2012","starter culture 2018","cheesemaking day1","cheesemaking day2","experiment_A","experiment_B","experiment_C","experiment_D","experiment_E"))
# The second theme() call wins: the x-axis tick labels are blanked
NUMpLOT <- ggplot(final_phage, aes(x = sample, y = percent_Prophage)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  lims(y = c(0, 15)) +
  labs(x = "", y = "Percent of S. thermophilus\nwith prophage") +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  theme(axis.text.x = element_blank())
NUMpLOT
# Integration-supporting coverage relative to the summed coverage of both phages
final_phage$percent_Prophage_of_phages <- 100 * (final_phage$phageCoverage / (final_phage$Streptococcus_phage_2 + final_phage$Streptococcus_phage_1))
NUMpLOT_phage <- ggplot(final_phage, aes(x = sample, y = percent_Prophage_of_phages)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  lims(y = c(0, 15)) +
  labs(x = "", y = "Percent of S. phage\nwhich is inserted") +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  theme(axis.text.x = element_blank())
NUMpLOT_phage
# NOTE(review): disabled -- `final_phage` is in wide format and has neither a
# `chr` nor a `median_01` column, so this summary stops with an error. It
# looks like a leftover from an earlier long-format table; restore it against
# the right data frame if the Lactobacillus phage statistics are still needed.
# final_phage %>% filter(chr=="Lactobacillus_phage_1") %>% group_by(chr) %>%
# dplyr::summarize(mean = mean(median_01),sd=sd(median_01))
min(final_phage$percent_Prophage)
max(final_phage$percent_Prophage)
min(final_phage$percent_Prophage_of_phages)
max(final_phage$percent_Prophage_of_phages)
##----------
# Long format: one row per sample and phage-abundance column
final_phage_long_phage <- final_phage %>%
  gather(
    species, percent,
    c("abundance_phage1", "abundance_phage2"),
    factor_key = TRUE, na.rm = TRUE
  )
# Biolog_long <- gather(Biolog_all_plates, well, intensity, A01:H12, factor_key=TRUE,na.rm = TRUE)
# final_phage$abundance_phage_both <- final_phage$abundance_phage2+final_phage$abundance_phage1
ggplot(final_phage_long_phage, aes(x = sample, y = percent, fill = species)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  labs(x = "", y = "Percent of S. thermophilus MAG\nwith putative prophage") +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9))
##----------
## final plot
##----------
library(patchwork)
# on-screen preview: two stacked panels
integrationSite + NUMpLOT +
  plot_layout(nrow = 2, heights = c(2, 1)) #
# exported figure: three stacked panels
svg("../03_results/IntegrationProphage.svg", width = 6.5, height = 11.5)
integrationSite + NUMpLOT + NUMpLOT_phage +
  plot_layout(nrow = 3, heights = c(2, 1, 1)) #
dev.off()
# svg("~/Desktop/Projects/2019_RMK202_analysis/plot/IntegrationProphage.svg",width=6.5,height=5.5)
# NUMpLOT_phage+NUMpLOT + plot_layout(nrow=2) #
# dev.off()
1.6.4 Spacer origin
###-----------------------
## grouping
###-----------------------
# Lineage and colour for each of the 22 strains: the first 9 belong to
# lineage 1, the next 3 to lineage 2, then 6 to lineage 3 and 4 to lineage 4.
groupingRMK202Strains <- data.frame(
  strain = c(
    "13492", "13491", "24854", "24837", "13500", "13493", "13498", "SMAG", "24853",
    "13494", "24855", "S72",
    "13499c1", "13499", "13499c2", "24838", "24840", "S50",
    "13497", "24839", "13495", "13496"
  ),
  group = rep(
    c("lineage 1", "lineage 2", "lineage 3", "lineage 4"),
    times = c(9, 3, 6, 4)
  ),
  colors = rep(
    c("#0000FF", "#6699FF", "#99CCFF", "#00FFFF"),
    times = c(9, 3, 6, 4)
  )
)
# tab-separated, without header or row names
write.table(
  groupingRMK202Strains,
  "../03_results//ReferenceGenomeGrouping.txt",
  na = "", quote = FALSE, sep = "\t",
  row.names = FALSE, col.names = FALSE
)
# error with wrong number as genome labels, e.g. 24839 instead of the correct 24739
#ReferenceGenomeGrouping <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/00_FINAL/log/ReferenceGenomeGrouping.txt<", "\t", escape_double = FALSE, col_names = c("strain","lineage","colors"), trim_ws = TRUE)
# The grouping used from here on is read from a separate file
ReferenceGenomeGrouping <- read_delim(
  "../data_zenodo/non_genomic_data//ReferenceGenomeGrouping2.txt",
  "\t",
  escape_double = FALSE,
  col_names = c("strain", "lineage", "colors"),
  trim_ws = TRUE
)
##=========================
## new
##-------------spacer count DADA over metagenomic samples
dada_spacer_count <- read_delim(
  "../data_zenodo/non_genomic_data/dada_spacer_count_sterm.txt",
  "\t",
  escape_double = FALSE, trim_ws = TRUE
)
##=========================
## old
## the old analysis is pretty cool because I can distinguish between protospacer and spacer mapping
###-------------------------
## genome coverage
###-------------------------
read_count <- read_delim(
  "../data_zenodo/non_genomic_data/Coverage_bacteria_and_phages_from_MAG.bed",
  "\t",
  escape_double = FALSE,
  col_names = c(
    "chr", "start", "end", "count",
    "length_mapped", "geneLength", "unknown", "sample"
  ),
  trim_ws = TRUE
)
read_count$geneCoverage <- (read_count$count * 600) / read_count$geneLength
# Median feature coverage of CP046134 per sample (columns: sample, median)
all_final_strepto <- read_count %>%
  group_by(sample, chr) %>%
  dplyr::summarize(median = median(geneCoverage)) %>%
  filter(chr %in% c("CP046134")) %>%
  select(-chr)
# %>% filter(chr %in% c("CP046131","CP046134","Lactobacillus_phage_1","Streptococcus_phage_1","Streptococcus_phage_2"))
##-----------------------------------------------
##-------------------------bwa spacer count normalised
##-----------------------------------------------
#CRISPR_spacer_coverage <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/CRISPR_spacer_coverage.bed", "\t", escape_double = FALSE, col_names = c("Name","startSpacer","endSpacer","ClusterName","numReads","basesCoverd","basesTotal","numcov","sample"), trim_ws = TRUE)
#reads4normalization <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/reads4normalization.txt", "\t", escape_double = FALSE, col_names = c("sample","readNumber","species"), trim_ws = TRUE) ; reads4normalization <- reads4normalization[,-3]
CRISPR_spacer_coverage <- read_delim(
  "../data_zenodo/non_genomic_data/CRISPR_spacer_coverage_sterm.bed",
  "\t",
  escape_double = FALSE,
  col_names = c(
    "Name", "startSpacer", "endSpacer", "ClusterName", "numReads",
    "basesCoverd", "basesTotal", "numcov", "sample"
  ),
  trim_ws = TRUE
)
reads4normalization <- read_delim(
  "../data_zenodo/non_genomic_data/reads4normalization.txt",
  "\t",
  escape_double = FALSE,
  col_names = c("sample", "readNumber", "species"),
  trim_ws = TRUE
)
# the third column (species) is dropped
reads4normalization <- reads4normalization[, -3]
###----------------normalize with reads mapped to the CRISPR spacers
# CRISPR_spacer_coverage <- merge(CRISPR_spacer_coverage,reads4normalization,by="sample",all.x = TRUE)
# CRISPR_spacer_coverage$CPM <- 1000000*(CRISPR_spacer_coverage$numReads/CRISPR_spacer_coverage$readNumber)
# Active normalisation: reads per spacer divided by the median CP046134
# coverage of the same sample (still stored under the column name CPM).
# Sample G4_6_18 is excluded and rows without a coverage value are dropped.
CRISPR_spacer_coverage <- merge(
  CRISPR_spacer_coverage, all_final_strepto,
  by = "sample", all.x = TRUE
) %>%
  filter(sample != "G4_6_18") %>%
  mutate(CPM = numReads / median) %>%
  filter(!is.na(CPM))
# table(CRISPR_spacer_coverage$sample)
# CRISPR_spacer_coverage %>% filter(!is.na(CPM)) %>% select(sample) %>% table()
# CRISPR_spacer_coverage <- CRISPR_spacer_coverage[!duplicated(CRISPR_spacer_coverage[,c("sample","ClusterName")]), ]
# wide matrix: one row per spacer (Name), one column per sample
bwaSPACERCount_wide <- CRISPR_spacer_coverage %>%
  dplyr::select(sample, Name, CPM) %>%
  spread(., sample, CPM) # %>% rename(spacer=Name)
##-----------------------------------------------
##-------------------------bwa protospacer count normalised
##-----------------------------------------------
# Protospacer counterpart of the spacer table, with the same column layout.
# The path now starts with "../" like every other input of this analysis;
# it used to point to "./data_zenodo/...".
CRISPR_protospacer_coverage <- read_delim(
  "../data_zenodo/non_genomic_data/CRISPR_spacer_coverage_protospacer_sterm.bed",
  "\t",
  escape_double = FALSE,
  col_names = c(
    "Name", "startSpacer", "endSpacer", "ClusterName", "numReads",
    "basesCoverd", "basesTotal", "numcov", "sample"
  ),
  trim_ws = TRUE
)
# reads4normalization <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/reads4normalization.txt", "\t", escape_double = FALSE, col_names = c("sample","readNumber","species"), trim_ws = TRUE) ; reads4normalization <- reads4normalization[,-3]
###----------------normalize with reads mapped to the CRISPR spacers
# CRISPR_protospacer_coverage <- merge(CRISPR_protospacer_coverage,reads4normalization,by="sample",all.x = TRUE)
# CRISPR_protospacer_coverage$CPM <- 1000000*(CRISPR_protospacer_coverage$numReads/CRISPR_protospacer_coverage$readNumber)
# Active normalisation: reads per protospacer divided by the median CP046134
# coverage of the same sample (column `median` of all_final_strepto);
# sample G4_6_18 is excluded.
CRISPR_protospacer_coverage <- merge(
  CRISPR_protospacer_coverage, all_final_strepto,
  by = "sample", all.x = TRUE
) %>%
  filter(sample != "G4_6_18")
CRISPR_protospacer_coverage$CPM <- (CRISPR_protospacer_coverage$numReads / CRISPR_protospacer_coverage$median)
# rows per sample, and distribution of rows per cluster
table(CRISPR_protospacer_coverage$sample)
table(table(CRISPR_protospacer_coverage$ClusterName))
# drop rows without a coverage value for the sample
CRISPR_protospacer_coverage <- CRISPR_protospacer_coverage %>%
  filter(!is.na(CPM))
# CRISPR_protospacer_coverage <- CRISPR_protospacer_coverage[!duplicated(CRISPR_protospacer_coverage[,c("sample","ClusterName")]), ]
# table(CRISPR_spacer_coverage_02$sample)
# wide matrix: one row per protospacer (Name), one column per sample
bwaPROTOSPACERCount_wide <- CRISPR_protospacer_coverage %>%
  dplyr::select(sample, Name, CPM) %>%
  spread(sample, CPM) # %>% rename(spacer=Name) #rename(Name="spacer")
# bwaPROTOSPACERCount_wide <- CRISPR_spacer_coverage %>% dplyr::select(sample,Name,CPM) %>% unique()%>% spread(.,Name,CPM) #%>% rename(spacer=Name) #rename(Name="spacer")
# CRISPR_spacer_coverage %>% dplyr::select(sample,Name) %>% nrow()
# CRISPR_spacer_coverage %>% dplyr::select(sample,Name) %>% unique() %>% nrow()
# CRISPR_spacer_coverage %>% dplyr::select(sample,ClusterName)
###===========================================
## spacers only in experiment
###===========================================
ncol(bwaSPACERCount_wide)
# Rows whose values sum to zero over all columns except the first one and the
# experiment columns (names containing "_6_18")
bwaSPACERCount_wide[
  which(
    rowSums(
      bwaSPACERCount_wide[
        ,
        seq_len(ncol(bwaSPACERCount_wide)) != 1 &
          !grepl("_6_18", colnames(bwaSPACERCount_wide))
      ]
    ) == 0
  ),
]
dada_spacer_count
## the following spacers only have coverage in the experiment samples
onlyExperimentSpacers <- dada_spacer_count[
  which(
    rowSums(
      dada_spacer_count[
        ,
        seq_len(ncol(dada_spacer_count)) != 1 &
          !grepl("_6_18", colnames(dada_spacer_count))
      ]
    ) == 0
  ),
]
onlyExperimentSpacers
# bwaPROTOSPACERCount_wide[which(rowSums(bwaPROTOSPACERCount_wide[,-c(1,grep("_6_18",colnames(bwaPROTOSPACERCount_wide)))])==0),]
##!!!!!!!!!!!!!!!sample file
# metasample_colors is not created in this section; it has to exist already
sampleDF <- metasample_colors
##!!!!!!!!!!!!!!!spacer file
##-------information array assignment
metaspacer_info <- read_delim(
  "../data_zenodo/non_genomic_data/uniqueSpacers_count_Ref_wOLDstrains_both.txt",
  "\t",
  escape_double = FALSE, trim_ws = TRUE
)
# remove every ">" from the cluster names
metaspacer_info$METAClusterName <- gsub(
  ">", "", metaspacer_info$METAClusterName,
  fixed = TRUE
)
spacers_info <- read_delim(
  "../data_zenodo/non_genomic_data/spacers_info.txt",
  "\t",
  escape_double = FALSE, trim_ws = TRUE
)
##-------information spacer blast
# spacers_info_blast <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/all_differentBlastHits.txt", "\t", escape_double = FALSE, trim_ws = TRUE,col_names = c("DB","spacer","assigned")) %>% spread(., DB, assigned)
# long table (DB, spacer, assigned) turned into one column per DB value
spacers_info_blast <- read_delim(
  "../data_zenodo/non_genomic_data//all_differentBlastHits.txt",
  "\t",
  escape_double = FALSE, trim_ws = TRUE,
  col_names = c("DB", "spacer", "assigned")
) %>%
  spread(., DB, assigned)
##-------clustering vContact blast
# Streptococcus_phage_Cluster_assignment <- read_csv("~/Desktop/Projects/2019_PhageDB/Vcontact_with_meta_new//allPhages_vContact/Streptococcus_phage_Cluster_assignment.txt",col_types = cols(.default = "c")) %>% mutate(Size = as.double(Size), Quality = as.double(Quality)) %>% select(-c(Order,Family,Genus,Quality,Type.x))
# every column is read as character; Size is converted to numeric, and the
# columns Order, Family, Genus, Quality and Type.x are dropped
Streptococcus_phage_Cluster_assignment <- read_csv(
  "../data_zenodo/non_genomic_data/Streptococcus_phage_Cluster_assignment.txt",
  col_types = cols(.default = "c")
) %>%
  mutate(Size = as.double(Size), Quality = as.double(Quality)) %>%
  dplyr::select(-c(Order, Family, Genus, Quality, Type.x))
# Streptococcus_phage_Cluster_assignment[grep("rmk",Streptococcus_phage_Cluster_assignment$Genome),"Genome"]
# Streptococcus_phage_Cluster_assignment %>% filter(!is.na(Type.y)) %>% select(VC) %>% table()
# sort(table(Streptococcus_phage_Cluster_assignment$VC),decreasing=T) %>% head(n=30)
# sort(table(Streptococcus_phage_Cluster_assignment$Type.y),decreasing=T)
##!!!!!!!!!!!!!!!spacer matrix FROM DADA
# CRISPR_spacer_coverage_extended_wide <- CRISPR_spacer_coverage %>% select(sample,Name,CPM) %>% spread(sample, CPM)
library(readr)
dada_spacer_count <- read_delim(
  "../data_zenodo/non_genomic_data//dada_spacer_count_sterm.txt",
  "\t",
  escape_double = FALSE, trim_ws = TRUE
)
# attach METAClusterName via the shared `spacer` column, then drop `spacer`
metaspacer_info_tmp <- metaspacer_info %>%
  select(METAClusterName, spacer)
dada_spacer_count <- merge(
  dada_spacer_count, metaspacer_info_tmp,
  by.x = "spacer", by.y = "spacer", all.x = TRUE
) %>%
  select(-spacer) # %>% rename( spacer = METAClusterName)
##-------information strains explained
# library(readr)
# uniqueSpacers_count_Ref <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/ForDADA2/uniqueSpacers_count_Ref.txt","\t", escape_double = FALSE, trim_ws = TRUE)
# uniqueSpacers_count_Ref$Num_ref_explained <- (uniqueSpacers_count_Ref$REF_mst1>0)+(uniqueSpacers_count_Ref$REF_mst2>0)+(uniqueSpacers_count_Ref$REF_RMK202>0)
# uniqueSpacers_count_Ref$RefStrain_explained <- ifelse(uniqueSpacers_count_Ref$Num_ref_explained>1,"multiple Strains",ifelse(uniqueSpacers_count_Ref$REF_RMK202>0,"Meta_RMK202",ifelse(uniqueSpacers_count_Ref$REF_mst1>0,"mst 1",ifelse(uniqueSpacers_count_Ref$REF_mst2>0,"mst2","not explained"))))
# table(uniqueSpacers_count_Ref$RefStrain_explained)
# uniqueSpacers_count_Ref_final <- uniqueSpacers_count_Ref %>% mutate(spacer = str_replace( spacerName, ">","")) %>% select(spacer,RefStrain_explained)
#----new----
# unique(spacer_Infos_Sterm_final$sample)
# headerless table; SPACER is parsed as a number
spacer_Infos_Sterm_final <- read_delim(
  "../data_zenodo/non_genomic_data/spacer_Infos_Sterm_final.txt",
  "\t",
  escape_double = FALSE,
  col_names = c(
    "ClusterName", "numSpacers_in_CLUSTER", "sample",
    "array", "spc", "ARRAY", "SPACER"
  ),
  col_types = cols(SPACER = col_number()),
  trim_ws = TRUE
)
# spacer_Infos_Sterm_final <- unique(spacer_Infos_Sterm_final)
# spacer_Infos_Sterm_final_tmp <- spacer_Infos_Sterm_final %>% select(ClusterName,numSpacers_in_CLUSTER)
spacer_Infos_Sterm_final_tmp <- spacer_Infos_Sterm_final %>%
  select(ClusterName, numSpacers_in_CLUSTER, SPACER)
# keep the first row per (ClusterName, numSpacers_in_CLUSTER) combination
spacer_Infos_Sterm_final_tmp <- spacer_Infos_Sterm_final_tmp[
  !duplicated(
    spacer_Infos_Sterm_final_tmp[, c("ClusterName", "numSpacers_in_CLUSTER")]
  ),
]
# spacer_Infos_Sterm_final %>% filter(ClusterName=="Cluster_1")
# spacer_Infos_Sterm_final_tmp<- spacer_Infos_Sterm_final_tmp %>% filter(ClusterName=="Cluster_1")
# occurrence counts per cluster, one column per sample
spacer_Infos_Sterm_final_new <- spacer_Infos_Sterm_final %>%
  select(c(ClusterName, sample)) %>%
  table() %>%
  as.data.frame() %>%
  spread(sample, Freq)
spacer_Infos_Sterm_final_new <- merge(
  spacer_Infos_Sterm_final_new, spacer_Infos_Sterm_final_tmp,
  by = "ClusterName"
) %>%
  unique()
# table(spacer_Infos_Sterm_final_new$numSpacers_in_CLUSTER)
# spacer_Infos_Sterm_final_new2 <- merge(spacer_Infos_Sterm_final_new,spacer_Infos_Sterm_final_tmp,by="ClusterName") %>% unique()
# sum(is.na(spacer_Infos_Sterm_final_new$SPACER))
# uniqueSpacers_count_Ref$RefStrain_explained <- ifelse(spacer_Infos_Sterm_final_new$numSpacers_in_CLUSTER>1,"multiple Strains",ifelse(uniqueSpacers_count_Ref$REF_RMK202>0,"Meta_RMK202",ifelse(uniqueSpacers_count_Ref$REF_mst1>0,"mst 1",ifelse(uniqueSpacers_count_Ref$REF_mst2>0,"mst2","not explained"))))
# range(table(spacer_Infos_Sterm_final$sample))
uniqueSpacers_count_Ref_final <- spacer_Infos_Sterm_final_new
nrow(spacer_Infos_Sterm_final_new)
##-------merge
# spacers_final <- merge(spacers_info,spacers_info_blast,by="spacer",all=TRUE)
# spacers_final$explainedBLAST <- ifelse(!is.na(spacers_final$localPHAGE),"localPHAGE",ifelse(!is.na(spacers_final$localBAC),"localBAC",ifelse(!is.na(spacers_final$phageDB),"phageDB",ifelse(!is.na(spacers_final$BactDB),"phageDB","No-match"))))
# # table(spacers_final$explainedBLAST)
# spacers_final <- merge(spacers_final,uniqueSpacers_count_Ref_final,by="spacer",all=TRUE)
# spacers_final <- merge(spacers_final,Streptococcus_phage_Cluster_assignment,by.x="phageDB",by.y="Genome",all=TRUE)
# spacers_final <- merge(spacers_final,spacer_Infos_Sterm_final_new,by.x="phageDB",by.y="Genome",all=TRUE)
##-------merge new
# sort(table(spacers_info_blast$phageDB),decreasing=TRUE)
# sort(table(spacers_info_blast$localPHAGE),decreasing=TRUE)
# Full outer join of the spacer table with the BLAST hits per database
spacers_final <- merge(
  metaspacer_info, spacers_info_blast,
  by.x = "METAClusterName", by.y = "spacer", all = TRUE
)
# First database with a hit, checked in the order
# localPHAGE > localBAC > phageDB > BactDB; otherwise "No-match"
spacers_final$explainedBLAST <- ifelse(
  !is.na(spacers_final$localPHAGE), "localPHAGE",
  ifelse(
    !is.na(spacers_final$localBAC), "localBAC",
    ifelse(
      !is.na(spacers_final$phageDB), "phageDB",
      ifelse(!is.na(spacers_final$BactDB), "BactDB", "No-match")
    )
  )
)
spacers_final[which(spacers_final$explainedBLAST == "No-match"), ]
#spacers_final$explainedBLAST <- ifelse(!is.na(spacers_info_blast$localPHAGE),"localPHAGE",ifelse(!is.na(spacers_info_blast$localBAC),"localBAC",ifelse(!is.na(spacers_info_blast$phageDB),"phageDB",ifelse(!is.na(spacers_info_blast$BactDB),"BacDB","No-match"))))
table(spacers_final$explainedBLAST)
spacers_final[which(spacers_final$explainedBLAST == "BactDB"), ]
sum(is.na(spacers_final$explainedBLAST))
# table(spacers_final$VC)
# dim(uniqueSpacers_count_Ref_final)
# uniqueSpacers_count_Ref_final$numSpacers_in_CLUSTER==0
# # nrow(spacers_final)
# nrow(uniqueSpacers_count_Ref_final)
# sum(is.na(spacers_final$phageDB))
# spacers_final$phageDB
# Rename four phage names before joining on Streptococcus_phage_Cluster_assignment$Genome.
# revalue() is called with its namespace, as everywhere else in this analysis,
# so the call does not depend on plyr being attached.
spacers_final$phageDB <- plyr::revalue(spacers_final$phageDB,c("Lactococcus lactis phage BK5-T"="Lactococcus phage BK5-T","Streptococcus thermophilus bacteriophage 7201"="Streptococcus virus 7201","Streptococcus thermophilus bacteriophage Sfi19"="Streptococcus virus Sfi19","Streptococcus thermophilus temperate bacteriophage O1205"="Streptococcus virus O1205"))
# add the per-cluster sample counts (full outer join) ...
spacers_final <- merge(
  spacers_final, uniqueSpacers_count_Ref_final,
  by.x = "ClusterINFO", by.y = "ClusterName", all = TRUE
)
# ... and the phage cluster assignment (left join)
spacers_final <- merge(
  spacers_final, Streptococcus_phage_Cluster_assignment,
  by.x = "phageDB", by.y = "Genome", all.x = TRUE
)
# spacers with a phageDB hit for which no cluster (VC) was found
spacers_final[which(is.na(spacers_final$VC) & !is.na(spacers_final$phageDB)), ]
#
# tmp <- spacers_final[which(is.na(spacers_final$VC)&!is.na(spacers_final$phageDB)),] %>% select(phageDB)
#
# for (i in 1:nrow(tmp)) { print(grep(paste0(" ",tmp[i,"phageShort"],"$"),Streptococcus_phage_Cluster_assignment$Genome))}
#
# i=9
# tmp[i,"phageDB"]
# Streptococcus_phage_Cluster_assignment[grep(paste0(" ",tmp[i,"phageShort"],"$"),Streptococcus_phage_Cluster_assignment$Genome),"Genome"]
#
# grep(tmp$phageShort,spacers_final$phageDB)
# tmp
# Streptococcus_phage_Cluster_assignment[grep("7201$",Streptococcus_phage_Cluster_assignment$Genome),]
# spacers_final <- merge(spacers_final,spacer_Infos_Sterm_final_new,by.x="ClusterINFO",by.y="ClusterName",all=TRUE)
##------------------add unique
spacers_final$RefStrain_explained <- ifelse(is.na(spacers_final$numSpacers_in_CLUSTER),"only Metagenome",ifelse(spacers_final$numSpacers_in_CLUSTER>1,"multiple Strains",ifelse(spacers_final$numSpacers_in_CLUSTER==1,"unique spacer","not explained")))
# table(spacers_final$RefStrain_explained )
# hist(spacers_final$numSpacers_in_CLUSTER)
##--------------quick analysis
table(spacers_final$VC) %>% sort()
table(spacers_final$Type.y) %>% sort()
# table(spacers_final$Type.x) %>% sort()
Streptococcus_phage_Cluster_assignment %>% filter(VC=="151_0")
Streptococcus_phage_Cluster_assignment %>% filter(VC=="92_0")
Streptococcus_phage_Cluster_assignment %>% filter(VC=="385_0")
Streptococcus_phage_Cluster_assignment %>% filter(VC=="316_1")
Streptococcus_phage_Cluster_assignment %>% filter(VC=="251_1")
# Streptococcus_phage_Cluster_assignment[grep("Javan63",Streptococcus_phage_Cluster_assignment$Genome),]
spacers_final %>% filter(!is.na(phageDB)) %>% filter(is.na(VC))
write.table(spacers_final, "../03_results//Clusters_spacers_final.txt",na = "", quote = FALSE, sep = "\t",row.names = FALSE, col.names =FALSE)
### -----------------------------------
## Number of spacers per cluster / spacer position within the array
### -----------------------------------
nrow(spacers_final)
table(spacers_final$numSpacers_in_CLUSTER)
sum(is.na(spacers_final$numSpacers_in_CLUSTER))
table(spacers_final$SPACER)
spacers_final$ARRAYINFO

# Exploratory view: absolute counts per spacer position, one panel per array.
ggplot(
  spacers_final,
  aes(x = SPACER, fill = explainedBLAST, color = explainedBLAST)
) +
  geom_bar() +
  facet_wrap(~ARRAYINFO) +
  theme_classic()

# Plot data: only rows with a known spacer position.
DF <- filter(
  select(spacers_final, explainedBLAST, SPACER, ARRAYINFO),
  !is.na(SPACER)
)
# Fix the stacking order of the BLAST categories.
DF$explainedBLAST <- factor(
  DF$explainedBLAST,
  levels = c("No-match", "BactDB", "localBAC", "phageDB", "localPHAGE")
)
# One colour per factor level, in level order.
colorsssss <- c("lightgray", "yellow", "goldenrod3", "darkturquoise", "darkcyan")

# Proportion of each BLAST category per spacer position (bars filled to 100 %).
spacerlocation_explained <- ggplot(DF, aes(x = SPACER, fill = explainedBLAST)) +
  geom_bar(position = "fill") +
  labs(x = "Spacer position", y = "percent of spacers") +
  scale_y_continuous(labels = scales::percent) +
  facet_wrap(~ARRAYINFO, scales = "free") +
  theme_classic() +
  scale_fill_manual(values = colorsssss) +
  theme(legend.position = "none")
print(spacerlocation_explained)

# Render the same plot into an SVG file (the device is closed right after).
svg("../03_results//spacerlocation_explained.svg", width = 5, height = 3)
print(spacerlocation_explained)
dev.off()
1.6.5 protospacer/spacer
library(ggplot2)
## ---------------------------
### correlation protospacer and spacer
## ---------------------------
# Reshape both wide count tables to long format (one row per Name x sample)
# and tag each with the method that produced it.
bwaSPACERCount_ggprep <- add_column(
  gather(bwaSPACERCount_wide, key = "sample", value = "bwaCount", -Name),
  method = "bwa"
)
bwaPROTOSPACERCount_ggprep <- add_column(
  gather(bwaPROTOSPACERCount_wide, key = "sample", value = "protospacer", -Name),
  method = "bwaProto"
)

# Pair spacer and protospacer counts per Name/sample; the two method tags are
# dropped again after the join.
countComparison <- select(
  merge(bwaPROTOSPACERCount_ggprep, bwaSPACERCount_ggprep, by = c("Name", "sample")),
  -method.x, -method.y
)

# Attach the spacer annotation (full outer join on the cluster name).
countComparison_ggprep <- merge(
  countComparison, spacers_final,
  by.x = "Name", by.y = "METAClusterName", all = TRUE
)

# Shift both counts by 0.1 (presumably a pseudocount so that zeros stay finite
# on the log-scaled axes used below).
countComparison_ggprep <- transform(
  countComparison_ggprep,
  protospacer = protospacer + 0.1,
  bwaCount = bwaCount + 0.1
)

# Exploratory scatter plots: coloured by reference-strain class (log2 axes),
# by BLAST class with linear fits (log2 axes), and the same on linear axes.
ggplot(
  countComparison_ggprep,
  aes(x = protospacer, y = bwaCount, color = RefStrain_explained)
) +
  geom_point() +
  theme_classic() +
  coord_trans(y = "log2", x = "log2")
ggplot(
  countComparison_ggprep,
  aes(x = protospacer, y = bwaCount, color = explainedBLAST)
) +
  geom_point() +
  theme_classic() +
  coord_trans(y = "log2", x = "log2") +
  geom_smooth(method = "lm", fill = NA, se = TRUE)
ggplot(
  countComparison_ggprep,
  aes(x = protospacer, y = bwaCount, color = explainedBLAST)
) +
  geom_point() +
  theme_classic() +
  geom_smooth(method = "lm", fill = NA, se = TRUE)
# countComparison_ggprep$explainedBLAST
## ---------------------------
# Remove low samples
## ---------------------------
table(countComparison_ggprep$sample)

# Keep rows where both counts are present and the spacer matches a phage
# (local or database); the two listed samples are excluded.
countComparison_ggprep_02 <- filter(
  countComparison_ggprep,
  !is.na(protospacer),
  !is.na(bwaCount),
  explainedBLAST %in% c("localPHAGE", "phageDB"),
  !sample %in% c("di_K2_6h", "th_K2_8h")
)
table(countComparison_ggprep_02$explainedBLAST)
countComparison_ggprep_02$explainedBLAST <- factor(
  countComparison_ggprep_02$explainedBLAST,
  levels = c("localPHAGE", "phageDB")
)

# Colours in factor-level order: localPHAGE, phageDB.
colorsssss <- c("red", "darkturquoise")
# Generic formula used by stat_poly_eq() for the fitted-line annotation.
my.formula <- y ~ x
library(ggpmisc)

# Spacer vs protospacer counts on log10 axes with a per-class linear fit and
# the fitted equation / R^2 printed in the panel.
# NOTE(review): geom_point() is added twice; the second, opaque layer is drawn
# over the alpha = 0.5 layer. Kept as in the original.
plot <- ggplot(
  countComparison_ggprep_02,
  aes(x = protospacer, y = bwaCount, color = explainedBLAST)
) +
  geom_point(alpha = 0.5) +
  theme_classic() +
  scale_fill_manual(values = colorsssss) +
  scale_color_manual(values = colorsssss) +
  labs(x = "protospacer [copy number]", y = "spacers [copy number]") +
  geom_point() +
  scale_y_log10(breaks = c(0.1, 0.15, 0.3)) +
  scale_x_log10(breaks = c(0.1, 1, 10, 30)) +
  geom_smooth(method = "lm", se = FALSE) +
  stat_poly_eq(
    formula = my.formula,
    aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
    parse = TRUE
  )
print(plot)

# Export as PNG at 300 dpi.
png("../03_results//spacer_protospacer_regression.png", width = 2800, height = 2500, res = 300)
print(plot)
dev.off()
## --------------------
## Marginal density plots ("histogram")
## --------------------
# Density of protospacer counts, one stacked panel per BLAST class; axis text,
# ticks and legend are removed (the plot is combined with the scatter plot
# via patchwork elsewhere in the script).
plot_density_x <- ggplot(
  countComparison_ggprep_02,
  aes(x = protospacer, color = explainedBLAST, fill = explainedBLAST)
) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  facet_wrap(~explainedBLAST, scales = "free_y", nrow = 2) +
  scale_fill_manual(values = colorsssss) +
  scale_color_manual(values = colorsssss) +
  labs(x = "", y = "density") +
  scale_x_log10(breaks = c(1, 10, 30)) +
  theme(
    legend.position = "none",
    axis.ticks = element_blank(),
    axis.text = element_blank(),
    axis.title.x = element_blank()
  )
print(plot_density_x)

# Density of spacer counts, flipped so the value axis runs vertically.
plot_density_y <- ggplot(
  countComparison_ggprep_02,
  aes(x = bwaCount, color = explainedBLAST, fill = explainedBLAST)
) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  scale_fill_manual(values = colorsssss) +
  scale_color_manual(values = colorsssss) +
  labs(x = "", y = "density") +
  scale_x_log10(breaks = c(0.15, 0.3)) +
  theme(
    legend.position = "none",
    axis.ticks = element_blank(),
    axis.text = element_blank(),
    axis.title.y = element_blank()
  ) +
  coord_flip()
print(plot_density_y)
### ---------------------------
## Make mixed plot (scatter plot with marginal densities)
### ---------------------------
library(ggpubr)
library(patchwork)

# Full composite: top row = protospacer density plus an empty cell,
# bottom row = scatter plot (legend removed) plus the spacer density.
spacer_vs_proto_composite <-
  (plot_density_x + plot_spacer() + plot_layout(widths = c(3, 1))) /
  ((plot + theme(legend.position = "none") + plot_density_y) + plot_layout(widths = c(3, 1))) +
  plot_layout(heights = c(2, 3))
print(spacer_vs_proto_composite)

# Variant 1: scatter plot with the spacer density to its right.
svg("../03_results//spacer_vs_proto_01.svg", width = 8, height = 6)
print((plot + theme(legend.position = "none")) + plot_density_y + plot_layout(widths = c(3, 1)))
dev.off()

# Variant 2: protospacer density on top of the scatter plot.
svg("../03_results//spacer_vs_proto_02.svg", width = 6, height = 8)
print(plot_density_x / (plot + theme(legend.position = "none")) + plot_layout(heights = c(1, 3)))
dev.off()

# Variant 3: full composite, written as PNG and as SVG.
# Each device is opened, drawn to and closed on its own. Previously png() and
# svg() were opened back to back with a single dev.off(), which left the PNG
# device open (and empty) so that later plots were drawn into it.
png("../03_results//spacer_vs_proto_03.png", width = 2000, height = 2000, res = 300)
print(spacer_vs_proto_composite)
dev.off()
svg("../03_results//spacer_vs_proto_03.svg", width = 5.5, height = 7)
print(spacer_vs_proto_composite)
dev.off()
## --------------------
## Ridge plot
## --------------------
# Data for the ridge plot: all rows of countComparison_ggprep_02.
# Alternative subsets that were tried (each was immediately overwritten by the
# assignment below, so they are kept here only as reference):
#   countComparison_ggprep_02 %>% filter(explainedBLAST!="localPHAGE") %>% filter(sample!="Versand_202") %>% filter(sample!="RMK202")
#   countComparison_ggprep_02 %>% filter(explainedBLAST=="localPHAGE") %>% filter(sample!="Versand_202") %>% filter(sample!="RMK202")
#   countComparison_ggprep_02 %>% filter(sample!="Versand_202") %>% filter(sample!="RMK202")
countComparison_ggprep_03 <- countComparison_ggprep_02
library(ggridges)

# Distribution of protospacer counts per sample (log10 x axis), one panel per
# localPHAGE value, ridges filled by their x value.
ggridgplot <- ggplot(
  countComparison_ggprep_03,
  aes(x = protospacer, y = sample, fill = stat(x))
) +
  facet_wrap(~localPHAGE, nrow = 3, scales = "free_y") +
  scale_x_log10(breaks = c(0.1, 1, 10, 30)) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "protospacers", option = "C") +
  labs(x = "protospacers [copy number]", y = "") +
  theme(legend.position = "none")
print(ggridgplot)

# Written to its own file: the previous target,
# "spacer_protospacer_regression_boxplot.png", is also the output name of the
# boxplot section, which overwrote this figure.
png("../03_results/Protospacer_distribution.png", width = 2000, height = 2500, res = 300)
print(ggridgplot)
dev.off()
## --------------------
## Boxplot
## --------------------
# Quick look: protospacer counts per BLAST class.
ggplot(
  countComparison_ggprep_02,
  aes(x = explainedBLAST, y = protospacer, color = explainedBLAST, fill = explainedBLAST)
) +
  geom_boxplot(alpha = 0.5)
library(ggpubr)
library(patchwork)

# Protospacer counts per BLAST class on a log10 axis, annotated with a
# Wilcoxon test comparing the groups.
plot_box_x <- ggplot(
  countComparison_ggprep_02,
  aes(
    x = explainedBLAST, group = explainedBLAST, y = protospacer,
    color = explainedBLAST, fill = explainedBLAST
  )
) +
  geom_boxplot(alpha = 0.5) +
  theme_classic() +
  labs(x = "", y = "protospacers [cpm]") +
  scale_y_log10(breaks = c(1, 10, 100, 1000, 10000, 100000)) +
  theme(legend.position = "none") +
  stat_compare_means(method = "wilcox.test")
print(plot_box_x)

# Same layout for the spacer counts.
plot_box_y <- ggplot(
  countComparison_ggprep_02,
  aes(
    x = explainedBLAST, group = explainedBLAST, y = bwaCount,
    color = explainedBLAST, fill = explainedBLAST
  )
) +
  geom_boxplot(alpha = 0.5) +
  theme_classic() +
  labs(x = "", y = "spacers [cpm]") +
  scale_y_log10(breaks = c(1, 10, 100, 1000, 10000, 100000)) +
  theme(legend.position = "none") +
  stat_compare_means(method = "wilcox.test")
print(plot_box_y)

# Both boxplots side by side, on screen and as PNG.
print(plot_box_x + plot_box_y)
png("../03_results/spacer_protospacer_regression_boxplot.png", width = 2000, height = 2500, res = 300)
print(plot_box_x + plot_box_y)
dev.off()
# legend_1 <- g_legend(p_mst1_sterm_woModi_clustered)
library(gridExtra)
# Arrange four plots in a 2 x 2 grid, once on screen and once into an SVG file.
# This sequence used to appear twice in a row, verbatim; the second copy only
# re-rendered the same file and has been removed.
# NOTE(review): p1 and p2 are not assigned anywhere in the visible part of the
# script, and p3/p4 are only (re)assigned further down. TODO confirm these
# objects hold the intended plots at this point.
grid.arrange(p1, p2, p3, p4, ncol = 2, heights = c(1, 2))
svg("../03_results//polishing_K1.svg", width = 6, height = 5)
grid.arrange(p1, p2, p3, p4, ncol = 2, heights = c(2, 3))
dev.off()
## ---------------------------
# Line plots
# Same filtering as for the regression plot, additionally restricted to
# spacers with a local phage hit (localPHAGE not NA).
countComparison_ggprep_02 <- filter(
  countComparison_ggprep,
  !is.na(protospacer),
  !is.na(bwaCount),
  explainedBLAST %in% c("localPHAGE", "phageDB"),
  !sample %in% c("di_K2_6h", "th_K2_8h"),
  !is.na(localPHAGE)
)
table(countComparison_ggprep_02$explainedBLAST)
countComparison_ggprep_02$explainedBLAST <- factor(
  countComparison_ggprep_02$explainedBLAST,
  levels = c("localPHAGE", "phageDB")
)
colorsssss <- c("darkcyan", "darkturquoise")
# Generic formula used by stat_poly_eq() for the fitted-line annotation.
my.formula <- y ~ x
library(ggpmisc)

# Spacer vs protospacer regression, one panel per sample x local phage.
plot <- ggplot(
  countComparison_ggprep_02,
  aes(x = protospacer, y = bwaCount, color = explainedBLAST)
) +
  geom_point() +
  theme_classic() +
  facet_wrap(~sample + localPHAGE, ncol = 3) +
  scale_fill_manual(values = colorsssss) +
  scale_color_manual(values = colorsssss) +
  labs(x = "protospacer [cpm]", y = "spacers [cpm]") +
  geom_point() +
  scale_y_log10(breaks = c(1, 10, 100, 1000, 5000)) +
  scale_x_log10(breaks = c(1, 10, 100, 1000, 10000, 100000)) +
  geom_smooth(method = "lm") +
  stat_poly_eq(
    formula = my.formula,
    aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
    parse = TRUE
  )
print(plot)

# Tall PNG so that all facets remain legible.
png("../03_results//spacer_vs_proto_04.png", width = 4000, height = 12000, res = 300)
print(plot)
dev.off()
# Build a line plot of protospacer counts across samples: one line per spacer
# (Name), coloured by the local phage it matches.
#   counts          data frame with columns sample, protospacer, Name, localPHAGE
#   legend_position value passed on to theme(legend.position = ...)
# Returns the ggplot object.
protospacer_line_plot <- function(counts, legend_position) {
  ggplot(
    counts,
    aes(x = sample, y = protospacer, group = Name, color = localPHAGE, fill = localPHAGE)
  ) +
    geom_line(size = 0.5, alpha = 1) +
    labs("", x = "", y = "protospacer [cpm]") +
    theme_classic() +
    scale_x_discrete(expand = c(0, 0)) +
    theme(
      axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
      rect = element_rect(fill = "transparent"),
      legend.position = legend_position
    )
}

# All samples, without legend.
p3 <- protospacer_line_plot(countComparison_ggprep_02, "none")
p3

# Same plot without the samples "Versand_202" and "RMK202"; legend at the bottom.
countComparison_ggprep_03 <- countComparison_ggprep_02 %>%
  filter(!sample %in% c("Versand_202", "RMK202"))
p4 <- protospacer_line_plot(countComparison_ggprep_03, "bottom")
p4

# The path used to start with "~../", which is neither a home-directory path
# nor the relative results folder used everywhere else in this script.
png("../03_results//spacer_vs_proto_05.png", width = 2500, height = 12000, res = 300)
# svg("~/Desktop/Projects/2019_RMK202_analysis/plot/spacer_vs_proto_03.svg",width=6,height=6)
p3 / p4
dev.off()
## ---------------------------
# Line plots of viral DB
## ---------------------------
# Phage-matching spacers that have no local phage hit (localPHAGE is NA).
countComparison_ggprep_new <- filter(
  countComparison_ggprep,
  !is.na(protospacer),
  !is.na(bwaCount),
  explainedBLAST %in% c("localPHAGE", "phageDB"),
  !sample %in% c("di_K2_6h", "th_K2_8h"),
  is.na(localPHAGE)
)
table(countComparison_ggprep_new$explainedBLAST)
# Assigned as in the sections above; the manual colour scales are not applied
# to the plot below.
colorsssss <- c("darkcyan", "darkturquoise")
# Generic formula used by stat_poly_eq() for the fitted-line annotation.
my.formula <- y ~ x
library(ggpmisc)

# Spacer vs protospacer regression per sample, default ggplot colours.
plot <- ggplot(
  countComparison_ggprep_new,
  aes(x = protospacer, y = bwaCount, color = explainedBLAST)
) +
  geom_point() +
  theme_classic() +
  facet_wrap(~sample + localPHAGE, ncol = 3) +
  labs(x = "protospacer [cpm]", y = "spacers [cpm]") +
  geom_point() +
  scale_y_log10(breaks = c(1, 10, 100, 1000, 5000)) +
  scale_x_log10(breaks = c(1, 10, 100, 1000, 10000, 100000)) +
  geom_smooth(method = "lm") +
  stat_poly_eq(
    formula = my.formula,
    aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
    parse = TRUE
  )
print(plot)

# Protospacer counts across samples, one uncoloured line per spacer (Name).
p4 <- ggplot(
  countComparison_ggprep_new,
  aes(x = sample, y = protospacer, group = Name)
) +
  geom_line(size = 0.5, alpha = 1) +
  labs("", x = "", y = "protospacer [cpm]") +
  theme_classic() +
  scale_x_discrete(expand = c(0, 0)) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
    rect = element_rect(fill = "transparent"),
    legend.position = "bottom"
  )
print(p4)